Capstone Project¶

In [1]:
import pandas as pd
import seaborn as sb
import numpy as np
import matplotlib.pyplot as plt
import datetime as dt
import plotly.graph_objs as go
import plotly.offline as pyoff

from __future__ import division
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing 
import scipy.cluster.hierarchy as sch
from sklearn.cluster import AgglomerativeClustering 

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics 

from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder

import warnings
warnings.filterwarnings('ignore')
In [4]:
data1 = pd.read_csv("/Users/nartneyu13/Desktop/CLV data - Capstone Project/Online_Sales.csv")
data2 = pd.read_excel("/Users/nartneyu13/Desktop/CLV data - Capstone Project/CustomersData.xlsx")
data3 = pd.read_csv("/Users/nartneyu13/Desktop/CLV data - Capstone Project/Discount_Coupon.csv")
data4 = pd.read_csv("/Users/nartneyu13/Desktop/CLV data - Capstone Project/Marketing_Spend.csv")
data5 = pd.read_excel("/Users/nartneyu13/Desktop/CLV data - Capstone Project/Tax_amount.xlsx")

Data Exploratory¶

In [5]:
data1.info()
data1.head()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52924 entries, 0 to 52923
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   CustomerID           52924 non-null  int64  
 1   Transaction_ID       52924 non-null  int64  
 2   Transaction_Date     52924 non-null  object 
 3   Product_SKU          52924 non-null  object 
 4   Product_Description  52924 non-null  object 
 5   Product_Category     52924 non-null  object 
 6   Quantity             52924 non-null  int64  
 7   Avg_Price            52924 non-null  float64
 8   Delivery_Charges     52924 non-null  float64
 9   Coupon_Status        52924 non-null  object 
dtypes: float64(2), int64(3), object(5)
memory usage: 4.0+ MB
Out[5]:
CustomerID Transaction_ID Transaction_Date Product_SKU Product_Description Product_Category Quantity Avg_Price Delivery_Charges Coupon_Status
0 17850 16679 1/1/2019 GGOENEBJ079499 Nest Learning Thermostat 3rd Gen-USA - Stainle... Nest-USA 1 153.71 6.5 Used
1 17850 16680 1/1/2019 GGOENEBJ079499 Nest Learning Thermostat 3rd Gen-USA - Stainle... Nest-USA 1 153.71 6.5 Used
2 17850 16681 1/1/2019 GGOEGFKQ020399 Google Laptop and Cell Phone Stickers Office 1 2.05 6.5 Used
3 17850 16682 1/1/2019 GGOEGAAB010516 Google Men's 100% Cotton Short Sleeve Hero Tee... Apparel 5 17.53 6.5 Not Used
4 17850 16682 1/1/2019 GGOEGBJL013999 Google Canvas Tote Natural/Navy Bags 1 16.50 6.5 Used
In [6]:
data2.info()
data2.head()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1468 entries, 0 to 1467
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   CustomerID     1468 non-null   int64 
 1   Gender         1468 non-null   object
 2   Location       1468 non-null   object
 3   Tenure_Months  1468 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 46.0+ KB
Out[6]:
CustomerID Gender Location Tenure_Months
0 17850 M Chicago 12
1 13047 M California 43
2 12583 M Chicago 33
3 13748 F California 30
4 15100 M California 49
In [7]:
data3.info()
data3.head()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 204 entries, 0 to 203
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Month             204 non-null    object
 1   Product_Category  204 non-null    object
 2   Coupon_Code       204 non-null    object
 3   Discount_pct      204 non-null    int64 
dtypes: int64(1), object(3)
memory usage: 6.5+ KB
Out[7]:
Month Product_Category Coupon_Code Discount_pct
0 Jan Apparel SALE10 10
1 Feb Apparel SALE20 20
2 Mar Apparel SALE30 30
3 Jan Nest-USA ELEC10 10
4 Feb Nest-USA ELEC20 20
In [8]:
data4.info()
data4.head()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 365 entries, 0 to 364
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Date           365 non-null    object 
 1   Offline_Spend  365 non-null    int64  
 2   Online_Spend   365 non-null    float64
dtypes: float64(1), int64(1), object(1)
memory usage: 8.7+ KB
Out[8]:
Date Offline_Spend Online_Spend
0 1/1/2019 4500 2424.50
1 1/2/2019 4500 3480.36
2 1/3/2019 4500 1576.38
3 1/4/2019 4500 2928.55
4 1/5/2019 4500 4055.30
In [9]:
data5.info()
data5.head()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Product_Category  20 non-null     object 
 1   GST               20 non-null     float64
dtypes: float64(1), object(1)
memory usage: 448.0+ bytes
Out[9]:
Product_Category GST
0 Nest-USA 0.10
1 Office 0.10
2 Apparel 0.18
3 Bags 0.18
4 Drinkware 0.18

Merging datasets¶

In [10]:
data6 = data2.merge(data1, on = "CustomerID")
data7 = data6.merge(data5, on = "Product_Category")
In [11]:
data7=data7.merge(data4, left_on = 'Transaction_Date', right_on = 'Date')
In [12]:
data7['Transaction_Date'] = pd.to_datetime(data7['Transaction_Date'], format = '%m/%d/%Y')
In [13]:
data7['Month'] = data7['Transaction_Date'].dt.strftime('%b')
In [14]:
df = data7.merge(data3, on = ['Month', 'Product_Category'], how = 'left')
In [15]:
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 52924 entries, 0 to 52923
Data columns (total 20 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   CustomerID           52924 non-null  int64         
 1   Gender               52924 non-null  object        
 2   Location             52924 non-null  object        
 3   Tenure_Months        52924 non-null  int64         
 4   Transaction_ID       52924 non-null  int64         
 5   Transaction_Date     52924 non-null  datetime64[ns]
 6   Product_SKU          52924 non-null  object        
 7   Product_Description  52924 non-null  object        
 8   Product_Category     52924 non-null  object        
 9   Quantity             52924 non-null  int64         
 10  Avg_Price            52924 non-null  float64       
 11  Delivery_Charges     52924 non-null  float64       
 12  Coupon_Status        52924 non-null  object        
 13  GST                  52924 non-null  float64       
 14  Date                 52924 non-null  object        
 15  Offline_Spend        52924 non-null  int64         
 16  Online_Spend         52924 non-null  float64       
 17  Month                52924 non-null  object        
 18  Coupon_Code          52524 non-null  object        
 19  Discount_pct         52524 non-null  float64       
dtypes: datetime64[ns](1), float64(5), int64(5), object(9)
memory usage: 8.5+ MB
In [16]:
df.isnull().sum()
Out[16]:
CustomerID               0
Gender                   0
Location                 0
Tenure_Months            0
Transaction_ID           0
Transaction_Date         0
Product_SKU              0
Product_Description      0
Product_Category         0
Quantity                 0
Avg_Price                0
Delivery_Charges         0
Coupon_Status            0
GST                      0
Date                     0
Offline_Spend            0
Online_Spend             0
Month                    0
Coupon_Code            400
Discount_pct           400
dtype: int64
In [17]:
# Impute missing value with mode
df['Coupon_Code'].fillna(df['Coupon_Code'].mode()[0], inplace = True)
df['Discount_pct'].fillna(df['Discount_pct'].mode()[0], inplace = True)
In [18]:
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 52924 entries, 0 to 52923
Data columns (total 20 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   CustomerID           52924 non-null  int64         
 1   Gender               52924 non-null  object        
 2   Location             52924 non-null  object        
 3   Tenure_Months        52924 non-null  int64         
 4   Transaction_ID       52924 non-null  int64         
 5   Transaction_Date     52924 non-null  datetime64[ns]
 6   Product_SKU          52924 non-null  object        
 7   Product_Description  52924 non-null  object        
 8   Product_Category     52924 non-null  object        
 9   Quantity             52924 non-null  int64         
 10  Avg_Price            52924 non-null  float64       
 11  Delivery_Charges     52924 non-null  float64       
 12  Coupon_Status        52924 non-null  object        
 13  GST                  52924 non-null  float64       
 14  Date                 52924 non-null  object        
 15  Offline_Spend        52924 non-null  int64         
 16  Online_Spend         52924 non-null  float64       
 17  Month                52924 non-null  object        
 18  Coupon_Code          52924 non-null  object        
 19  Discount_pct         52924 non-null  float64       
dtypes: datetime64[ns](1), float64(5), int64(5), object(9)
memory usage: 8.5+ MB
In [19]:
# Drop the column named "Date" from the DataFrame df
df.drop(columns=['Date'], inplace=True)
In [20]:
df.head()
Out[20]:
CustomerID Gender Location Tenure_Months Transaction_ID Transaction_Date Product_SKU Product_Description Product_Category Quantity Avg_Price Delivery_Charges Coupon_Status GST Offline_Spend Online_Spend Month Coupon_Code Discount_pct
0 17850 M Chicago 12 16679 2019-01-01 GGOENEBJ079499 Nest Learning Thermostat 3rd Gen-USA - Stainle... Nest-USA 1 153.71 6.5 Used 0.1 4500 2424.5 Jan ELEC10 10.0
1 17850 M Chicago 12 16680 2019-01-01 GGOENEBJ079499 Nest Learning Thermostat 3rd Gen-USA - Stainle... Nest-USA 1 153.71 6.5 Used 0.1 4500 2424.5 Jan ELEC10 10.0
2 17850 M Chicago 12 16696 2019-01-01 GGOENEBQ078999 Nest Cam Outdoor Security Camera - USA Nest-USA 2 122.77 6.5 Not Used 0.1 4500 2424.5 Jan ELEC10 10.0
3 17850 M Chicago 12 16699 2019-01-01 GGOENEBQ079099 Nest Protect Smoke + CO White Battery Alarm-USA Nest-USA 1 81.50 6.5 Clicked 0.1 4500 2424.5 Jan ELEC10 10.0
4 17850 M Chicago 12 16700 2019-01-01 GGOENEBJ079499 Nest Learning Thermostat 3rd Gen-USA - Stainle... Nest-USA 1 153.71 6.5 Clicked 0.1 4500 2424.5 Jan ELEC10 10.0
In [21]:
df.describe()
Out[21]:
CustomerID Tenure_Months Transaction_ID Quantity Avg_Price Delivery_Charges GST Offline_Spend Online_Spend Discount_pct
count 52924.00000 52924.000000 52924.000000 52924.000000 52924.000000 52924.000000 52924.000000 52924.000000 52924.000000 52924.000000
mean 15346.70981 26.127995 32409.825675 4.497638 52.237646 10.517630 0.137462 2830.914141 1893.109119 19.953518
std 1766.55602 13.478285 8648.668977 20.104711 64.006882 19.475613 0.045825 936.154247 807.014092 8.096509
min 12346.00000 2.000000 16679.000000 1.000000 0.390000 0.000000 0.050000 500.000000 320.250000 10.000000
25% 13869.00000 15.000000 25384.000000 1.000000 5.700000 6.000000 0.100000 2500.000000 1252.630000 10.000000
50% 15311.00000 27.000000 32625.500000 1.000000 16.990000 6.000000 0.180000 3000.000000 1837.870000 20.000000
75% 16996.25000 37.000000 39126.250000 2.000000 102.130000 6.500000 0.180000 3500.000000 2425.350000 30.000000
max 18283.00000 50.000000 48497.000000 900.000000 355.740000 521.360000 0.180000 5000.000000 4556.930000 30.000000

Checking for duplicate¶

In [22]:
# Original dimension of the merged dataset
df.shape
Out[22]:
(52924, 19)
In [23]:
df.duplicated().value_counts()
Out[23]:
False    52924
dtype: int64

Outliers¶

Quantity¶

In [24]:
# Round the values to the nearest integer
df['Quantity'] = np.round(df['Quantity']).astype(int)
# Plot the original Quantity histogram distribution
df['Quantity'].plot(kind='hist', bins=10, title='Original Quantity Distribution')
plt.show()

# Log transformation - adding 1 to avoid log(0) and round the transformed values to the nearest integer
df['Quantity_transformed'] = np.round(np.log1p(df['Quantity'])).astype(int)
# Plot the transformed Quantity histogram distribution
df['Quantity_transformed'].plot(kind='hist', bins=10, title='Log Transformed Quantity Distribution')
plt.show()
In [25]:
df['Quantity_transformed'].dtype
Out[25]:
dtype('int64')
In [26]:
upperlimit_Q = df['Quantity_transformed'].quantile(0.99)
lowerlimit_Q = df['Quantity_transformed'].quantile(0.01)
upperlimit_Q, lowerlimit_Q
Out[26]:
(4.0, 1.0)

Delivery Charge¶

In [27]:
# Plot the original Delivery_Charges histogram distribution
df['Delivery_Charges'].plot(kind='hist', bins=10, title='Original Delivery_Charges Distribution')
plt.show()

# Log transformation - adding 1 to avoid log(0)
df['Delivery_Charges_transformed'] = np.log1p(df['Delivery_Charges'])

# Plot the transformed Delivery_Charges histogram distribution
df['Delivery_Charges_transformed'].plot(kind='hist', bins=10, title='Log Transformed Delivery_Charges Distribution')
plt.show()

# Box plot
df['Delivery_Charges_transformed'].plot(kind='box', title='Log Transformed Delivery_Charges Distribution')
Out[27]:
<AxesSubplot:title={'center':'Log Transformed Delivery_Charges Distribution'}>
In [28]:
# Count how many instances that have "Quantity" greater than 2
greater_than_5_count = np.sum(df['Delivery_Charges'] > 100)
greater_than_5_count
Out[28]:
358
In [29]:
upperlimit_D = df['Delivery_Charges_transformed'].quantile(0.99)
lowerlimit_D = df['Delivery_Charges_transformed'].quantile(0.01)
upperlimit_D, lowerlimit_D
Out[29]:
(4.330733340286331, 1.9459101490553132)

Average Price¶

In [30]:
# Plot the original Avg_Price histogram distribution
df['Avg_Price'].plot(kind='hist', bins=10, title='Original Average Price Distribution')
plt.show()

# Log transformation - adding 1 to avoid log(0)
df['Avg_Price_transformed'] = np.log1p(df['Avg_Price'])

# Plot the transformed Quantity histogram distribution
df['Avg_Price_transformed'].plot(kind='hist', bins=10, title='Log Transformed Average Price Distribution')
plt.show()
In [31]:
# Plot the original Online_Spend histogram distribution
df['Online_Spend'].plot(kind='hist', bins=10, title='Original Online Spend Distribution')
plt.show()

# Log transformation - adding 1 to avoid log(0)
df['Online_Spend_transformed'] = np.sqrt(df['Online_Spend'])

# Plot the transformed Quantity histogram distribution
df['Online_Spend_transformed'].plot(kind='hist', bins = 10, title='Log Transformed Online Spend Distribution')
plt.show()

Treating Outliers¶

In [32]:
# define a funtion replace_outliers which return a dataframe with no outliers (using capping method)
# def replace_outliers(df, columns):
 #   for column in columns:
        # Calculate lower and upper bounds using 0.01 and 0.99 quantiles
  #      lower_bound = df[column].quantile(0.01)
  #      upper_bound = df[column].quantile(0.99)
        
        # Replace outliers with lower and upper bounds
   #     df[column] = df[column].apply(lambda x: upper_bound if x > upper_bound else (lower_bound if x < lower_bound else x))
   # return df
In [33]:
# List of columns where outliers should be replaced
#columns_to_replace = ['Offline_Spend', 'Online_Spend']
In [34]:
# Replace outliers in specified columns
#df_cleaned = replace_outliers(df, columns_to_replace)
In [35]:
#df.shape
In [36]:
#df_cleaned.shape
In [37]:
#df_cleaned.describe()

Calculating Total Purchase Amount for each customer¶

Invoice = ((Quantity Avg_price) (1 - Discount_pct) * (1 + GST)) + Delivery_Charges¶

In [38]:
df['Invoice'] = np.where(
    df['Coupon_Status'] == 'Used',
    ((df['Quantity'] * df['Avg_Price']) * (1 - df['Discount_pct']/100) * (1 +df['GST'])) + df['Delivery_Charges'],
    ((df['Quantity'] * df['Avg_Price']) * (1 + df['GST'])) + df['Delivery_Charges']
)

Converting data type¶

In [39]:
df['Product_Category'] = df['Product_Category'].astype('category')
df['Coupon_Status'] = df['Coupon_Status'].astype('category')
df['Gender'] = df['Gender'].astype('category')
df['Location'] = df['Location'].astype('category')
df['Month'] = df['Month'].astype('category')
df['Coupon_Code'] = df['Coupon_Code'].astype('category')
In [40]:
df.describe()
Out[40]:
CustomerID Tenure_Months Transaction_ID Quantity Avg_Price Delivery_Charges GST Offline_Spend Online_Spend Discount_pct Quantity_transformed Delivery_Charges_transformed Avg_Price_transformed Online_Spend_transformed Invoice
count 52924.00000 52924.000000 52924.000000 52924.000000 52924.000000 52924.000000 52924.000000 52924.000000 52924.000000 52924.000000 52924.000000 52924.000000 52924.000000 52924.000000 52924.000000
mean 15346.70981 26.127995 32409.825675 4.497638 52.237646 10.517630 0.137462 2830.914141 1893.109119 19.953518 1.249887 2.179542 3.126378 42.467054 101.947455
std 1766.55602 13.478285 8648.668977 20.104711 64.006882 19.475613 0.045825 936.154247 807.014092 8.096509 0.673419 0.539681 1.399033 9.468905 172.281095
min 12346.00000 2.000000 16679.000000 1.000000 0.390000 0.000000 0.050000 500.000000 320.250000 10.000000 1.000000 0.000000 0.329304 17.895530 4.603500
25% 13869.00000 15.000000 25384.000000 1.000000 5.700000 6.000000 0.100000 2500.000000 1252.630000 10.000000 1.000000 1.945910 1.902108 35.392513 20.160000
50% 15311.00000 27.000000 32625.500000 1.000000 16.990000 6.000000 0.180000 3000.000000 1837.870000 20.000000 1.000000 1.945910 2.889816 42.870386 45.636200
75% 16996.25000 37.000000 39126.250000 2.000000 102.130000 6.500000 0.180000 3500.000000 2425.350000 30.000000 1.000000 2.014903 4.635990 49.247843 137.400000
max 18283.00000 50.000000 48497.000000 900.000000 355.740000 521.360000 0.180000 5000.000000 4556.930000 30.000000 7.000000 6.258357 5.877007 67.505037 8979.275000
In [41]:
from scipy.stats import mode

# Define custom aggregation functions
def first_value(x):
    return x.iloc[0]

def last_value(x):
    return x.iloc[-1]

# Define function to get mode ignoring NaN values
def mode_ignore_nan(x):
    return x.mode().iloc[0] if not x.mode().empty else None


# Group by CustomerId and Aggregate data 
agg_df = df.groupby('CustomerID').agg({
    'Transaction_Date': lambda x: (x.max() - x.min()).days,  
    'Transaction_ID': 'count',  
    'Invoice': 'sum',  
    'Quantity':'sum',
    'Gender': first_value,   
    'Location': first_value, 
    'Tenure_Months': last_value,
    'Product_Category': mode_ignore_nan,
    'Coupon_Status': mode_ignore_nan
})

# Rename the columns
agg_df.columns = ['Number of Days', 'Number of Transaction', 'Total Purchase Amount', 'Quantity', 'Gender', 'Location', 'Tenure', 'Product Category', 'Coupon Status']
agg_df.head()
Out[41]:
Number of Days Number of Transaction Total Purchase Amount Quantity Gender Location Tenure Product Category Coupon Status
CustomerID
12346 0 2 174.98174 3 F New York 31 Apparel Used
12347 223 60 15686.84396 342 M New York 20 Nest-USA Clicked
12348 119 23 1689.55594 209 M California 39 Office Clicked
12350 0 17 1467.43528 21 M California 25 Apparel Clicked
12356 0 36 2007.42900 56 F Chicago 31 Apparel Clicked
In [42]:
agg_df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1468 entries, 12346 to 18283
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Number of Days         1468 non-null   int64  
 1   Number of Transaction  1468 non-null   int64  
 2   Total Purchase Amount  1468 non-null   float64
 3   Quantity               1468 non-null   int64  
 4   Gender                 1468 non-null   object 
 5   Location               1468 non-null   object 
 6   Tenure                 1468 non-null   int64  
 7   Product Category       1468 non-null   object 
 8   Coupon Status          1468 non-null   object 
dtypes: float64(1), int64(4), object(4)
memory usage: 114.7+ KB

Calculate CLV¶

In [43]:
# Average Order Value
agg_df['AOV'] = agg_df['Total Purchase Amount']/agg_df['Number of Transaction']

# Purchase Frequency
purchase_freq = sum(agg_df['Number of Transaction'])/len(agg_df)

# Repeat Rate
repeat_rate = agg_df[agg_df['Number of Transaction']>1].shape[0]/agg_df.shape[0]

# Churn Rate
agg_df['Churn Rate'] = 1 - repeat_rate
In [44]:
# Profit Margin
agg_df['Profit Margin'] = agg_df['Total Purchase Amount'] * 0.15

CLV1 = Customer Value * Average Customer Lifespan¶

where:

  • Customer Value = Average Purchase Value * Average Frequency Rate
  • Average Purchase Value = Total Revenue over a time frame / Total # of purchase over the same time frame
  • Average Frequency Rate = Total # of purchases over a period / Total # of customers during the same period
  • Average Customer Lifespan = Average # of days customers stay active / Total # of customers
In [45]:
# CLV1
agg_df['CLV1'] = (agg_df['AOV']*purchase_freq)*(agg_df['Number of Days'].mean()/len(agg_df))

CLV2 = Average Order Value (AOV) Average Purchase Frequency Rate (AFR) Gross Margin * Churn Rate¶

In [46]:
# CLV2
agg_df['CLV2'] = agg_df['AOV'] * agg_df['Number of Transaction'] * 0.15 * agg_df['Churn Rate']
In [47]:
agg_df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1468 entries, 12346 to 18283
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Number of Days         1468 non-null   int64  
 1   Number of Transaction  1468 non-null   int64  
 2   Total Purchase Amount  1468 non-null   float64
 3   Quantity               1468 non-null   int64  
 4   Gender                 1468 non-null   object 
 5   Location               1468 non-null   object 
 6   Tenure                 1468 non-null   int64  
 7   Product Category       1468 non-null   object 
 8   Coupon Status          1468 non-null   object 
 9   AOV                    1468 non-null   float64
 10  Churn Rate             1468 non-null   float64
 11  Profit Margin          1468 non-null   float64
 12  CLV1                   1468 non-null   float64
 13  CLV2                   1468 non-null   float64
dtypes: float64(6), int64(4), object(4)
memory usage: 172.0+ KB

Data Visualization¶

Location¶

In [48]:
location_counts = df['Location'].value_counts()

plt.figure(figsize=(10, 6))
location_counts.plot(kind='bar')
plt.title('Frequency of Locations')
plt.xlabel('Location')
plt.ylabel('Frequency')
plt.xticks(rotation=45)  # Rotate x-axis labels for readability
plt.show()

Correlation Heatmap¶

In [49]:
# Selecting numeric columns
numeric_columns = df.select_dtypes(include=['number']).columns

# Creating a new dataframe with only numeric columns
newdf = df[numeric_columns]

# Dropping specific columns from the new DataFrame
columns_to_drop = ['CustomerID', 'Transaction_ID', 'Online_Spend', 'Quantity_transformed','Delivery_Charges_transformed','Avg_Price_transformed']
newdf.drop(columns_to_drop, axis=1, inplace=True)

# Creating the correlation heatmap
plt.figure(figsize=(10, 8))
sb.heatmap(newdf.corr(), annot=True, cmap='Blues')
plt.title('Correlation Heatmap')
plt.show()

Customer Count by Each Month¶

In [50]:
df.groupby('Month')['CustomerID'].count()
monthly_customer_count = df.groupby('Month')['CustomerID'].nunique()


plt.figure(figsize=(8, 4))
monthly_customer_count.plot(kind='bar', color = 'lightcoral')
plt.xlabel('Month')
plt.ylabel('Customer Count')
plt.title('Customer Count by Month')
plt.xticks(ticks=range(0, 12), labels=['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'], rotation=45)
plt.show()

Customer Retention by Month¶

In [51]:
# Understand the retention of customers on a month-on-month basis
month_dict = {}
for i in df['Month'].unique():
   month_dict[i] = df[df['Month']==i]['CustomerID'].unique().tolist()

months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
retention = [0]
for i in range(11):
  set1 = set(month_dict[months[i]])
  set2 = set(month_dict[months[i+1]])
  common_items = len(set1.intersection(set2))
  retention.append(common_items)
In [52]:
plt.figure(figsize=(8, 3))
plt.bar(months, retention, color='lightcoral')
plt.xlabel('Months')
plt.ylabel('Retention Count')
plt.title('Customer Retention by Month')
plt.xticks(rotation=45)
plt.show()

Product Metrics Analysis¶

Monthly Revenue Growth¶

In [53]:
# Extract Month from transaction date
df['Transaction_Month'] = df['Transaction_Date'].dt.month

# Unique month
df['Transaction_Month'].unique()

# Calculate Revenue for each Month
df_revenue = df.groupby('Transaction_Month')['Invoice'].sum().reset_index()
df_revenue

# Calculate Monthly Revenue Growth 
growth_rate_lst = [0]

for i in range(len(df_revenue)-1):
    growth_rate = (df_revenue['Invoice'][i+1] - df_revenue['Invoice'][i])/df_revenue['Invoice'][i]
    growth_rate_lst.append(growth_rate)
    
df_revenue['growth_rate'] = growth_rate_lst

# Print dataframe
df_revenue
Out[53]:
Transaction_Month Invoice growth_rate
0 1 493977.33055 0.000000
1 2 375052.31962 -0.240750
2 3 414649.53948 0.105578
3 4 477082.59730 0.150568
4 5 365386.18740 -0.234124
5 6 360909.60021 -0.012252
6 7 451645.43540 0.251409
7 8 475670.79902 0.053195
8 9 396481.70738 -0.166479
9 10 480728.77581 0.212487
10 11 547770.52846 0.139459
11 12 556112.29179 0.015229
In [54]:
# Visualize monthly growth
plot1 = [
    go.Scatter(
    x = df_revenue[df_revenue['Transaction_Month'] != '12']['Transaction_Month'],
    y = df_revenue[df_revenue['growth_rate'] != '12']['growth_rate']
    )
]

plot_layout = go.Layout(xaxis = {"type":"category"}, title = 'Monthly Growth Rate')
fig = go.Figure(data = plot1, layout = plot_layout)
pyoff.plot(fig)
Out[54]:
'temp-plot.html'

On average, what is the total number of product sold by each month?¶

In [55]:
df_monthlysales = df.groupby('Transaction_Month')['Quantity'].sum().reset_index()

# Visualize monthly growth
plot2 = [
    go.Scatter(
    x = df_monthlysales['Transaction_Month'],
    y = df_monthlysales['Quantity']
    )
]

plot2_layout = go.Layout(xaxis = {"type":"category"}, title = 'Monthly Total Number of Orders')
fig2 = go.Figure(data = plot2, layout = plot2_layout)
pyoff.plot(fig2)
Out[55]:
'temp-plot.html'

New & Existing Customers¶

In [56]:
# Creating a new dataframe df_first_purchase which consists of the first purchase date for each customer
df_first_purchase = df.groupby('CustomerID').Transaction_Date.min().reset_index()
df_first_purchase.columns = ['CustomerID', 'First Purchase Date']

# Extracting the month from first purchase date
df_first_purchase['First_Purchase_Month'] = df_first_purchase['First Purchase Date'].dt.month

# Merging with the original dataset
df = pd.merge(df, df_first_purchase, on = 'CustomerID')

# Creating new column called UserType to distinguish new and existing customer
df['UserType'] = 'New'
df.loc[df['Transaction_Month'] != df['First_Purchase_Month'], 'UserType'] = 'Existing'

# Count number of new and existing customers
df['UserType'].value_counts()
Out[56]:
New         32033
Existing    20891
Name: UserType, dtype: int64
In [57]:
df_usertype_revenue = df.groupby(['Transaction_Month', 'UserType'])['Invoice'].sum().reset_index()
df_usertype_revenue = df_usertype_revenue.query("Transaction_Month != 1 and Transaction_Month != 12")
In [58]:
df_usertype_revenue
Out[58]:
Transaction_Month UserType Invoice
1 2 Existing 46524.59322
2 2 New 328527.72640
3 3 Existing 68630.05180
4 3 New 346019.48768
5 4 Existing 195895.51546
6 4 New 281187.08184
7 5 Existing 132229.87076
8 5 New 233156.31664
9 6 Existing 143638.73501
10 6 New 217270.86520
11 7 Existing 271483.00300
12 7 New 180162.43240
13 8 Existing 246865.43182
14 8 New 228805.36720
15 9 Existing 234028.56387
16 9 New 162453.14351
17 10 Existing 223293.20299
18 10 New 257435.57282
19 11 Existing 313156.60430
20 11 New 234613.92416
In [59]:
# Visualize
plot3 = [
    go.Scatter(
    x = df_usertype_revenue.query("UserType == 'Existing'")['Transaction_Month'],
    y = df_usertype_revenue.query("UserType == 'Existing'")['Invoice'],
    name = 'Existing'
    ),
    
    go.Scatter(
    x = df_usertype_revenue.query("UserType == 'New'")['Transaction_Month'],
    y = df_usertype_revenue.query("UserType == 'New'")['Invoice'],
    name = 'New'
    )
]

plot3_layout = go.Layout(xaxis = {"type":"category"}, title = 'Revenue of New vs. Existing Customers')
fig3 = go.Figure(data = plot3, layout = plot3_layout)
pyoff.plot(fig3)
Out[59]:
'temp-plot.html'
In [60]:
# Filter out the first and last months (assuming they are incomplete)
df_usertype_revenue_filtered = df_usertype_revenue[(df_usertype_revenue['Transaction_Month'] != 1) & (df_usertype_revenue['Transaction_Month'] != 12)]

# Create separate dataframes for new and existing customers
df_new_customer_revenue = df_usertype_revenue_filtered[df_usertype_revenue_filtered['UserType'] == 'New']
df_existing_customer_revenue = df_usertype_revenue_filtered[df_usertype_revenue_filtered['UserType'] == 'Existing']

# Create a line plot for new and existing customer revenue
plt.figure(figsize=(10, 6))
plt.plot(df_new_customer_revenue['Transaction_Month'], df_new_customer_revenue['Invoice'], marker='o', label='New Customer Revenue')
plt.plot(df_existing_customer_revenue['Transaction_Month'], df_existing_customer_revenue['Invoice'], marker='o', label='Existing Customer Revenue')
plt.title('Revenue by User Type Over Time (Excluding First and Last Months)')
plt.xlabel('Transaction Month')
plt.ylabel('Revenue')
plt.xticks(range(2, 12), ['Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov'])
plt.legend()
plt.grid(True)
plt.show()

New to Existing Customer Ratio¶

In [61]:
# Pecentage of new customer each month
count_new = df.query("UserType=='New'").groupby(['Transaction_Month'])['CustomerID'].nunique()
count_existing = df.query("UserType=='Existing'").groupby(['Transaction_Month'])['CustomerID'].nunique()
ratio_new_to_existing = count_new / count_existing
ratio_new_to_existing
Out[61]:
Transaction_Month
1          NaN
2     7.384615
3     5.709677
4     2.672131
5     1.272727
6     1.122951
7     0.661972
8     0.818182
9     0.678261
10    0.707317
11    0.566667
12    0.815385
Name: CustomerID, dtype: float64
In [62]:
ratio_new_to_existing = ratio_new_to_existing.reset_index()
ratio_new_to_existing = ratio_new_to_existing.dropna()
# Visualize
plot4 = [
    go.Bar(
    x = ratio_new_to_existing['Transaction_Month'],
    y = ratio_new_to_existing['CustomerID']
    )
]

plot4_layout = go.Layout(
xaxis = {"type":"category"},
    title = 'New Customer Ratio'
)

fig4 = go.Figure(data = plot4, layout = plot4_layout)
pyoff.plot(fig4)
Out[62]:
'temp-plot.html'

Monthly Retention Rate¶

In [63]:
# Calculate Revenue by each customer by each month -> assign to a dataframe
df_user_purchase = df.groupby(['CustomerID', 'Transaction_Month'])['Invoice'].sum().astype(int).reset_index()

# Dataframe with dummy variables that show whether customer made a transaction in a specific month
df_retention = pd.crosstab(df_user_purchase['CustomerID'], df_user_purchase['Transaction_Month']).reset_index()

# Define months
months = df_retention.columns[1:]

# Calculate retention
retention_array = []
for i in range(len(months) - 1):
    retention_data = {}
    selected_month = months[i + 1]
    prev_month = months[i]
    retention_data['Transaction_Month'] = selected_month
    retention_data['Total User Count'] = df_retention[selected_month].sum()
    retention_data['Retained User Count'] = df_retention[df_retention[prev_month] > 0][selected_month].sum()
    retention_array.append(retention_data)

# Convert to DataFrame
df_retention = pd.DataFrame(retention_array)

# Calculate Retention Rate
df_retention['Retention Rate'] = df_retention['Retained User Count'] / df_retention['Total User Count']

# Calculate Churn Rate
df_retention['Churn Rate'] = 1 - df_retention['Retention Rate']

print(df_retention)
    Transaction_Month  Total User Count  Retained User Count  Retention Rate  \
0                   2               109                   13        0.119266   
1                   3               208                   11        0.052885   
2                   4               224                   24        0.107143   
3                   5               200                   25        0.125000   
4                   6               259                   37        0.142857   
5                   7               236                   58        0.245763   
6                   8               300                   65        0.216667   
7                   9               193                   44        0.227979   
8                  10               210                   29        0.138095   
9                  11               188                   31        0.164894   
10                 12               236                   28        0.118644   

    Churn Rate  
0     0.880734  
1     0.947115  
2     0.892857  
3     0.875000  
4     0.857143  
5     0.754237  
6     0.783333  
7     0.772021  
8     0.861905  
9     0.835106  
10    0.881356  
In [64]:
import plotly.graph_objects as go

# Create traces for retention rate and churn rate
trace_retention = go.Scatter(
    x=df_retention['Transaction_Month'],
    y=df_retention['Retention Rate'],
    mode='lines+markers',
    name='Retention Rate',
    line=dict(color='navy')
)

trace_churn = go.Scatter(
    x=df_retention['Transaction_Month'],
    y=df_retention['Churn Rate'],
    mode='lines+markers',
    name='Churn Rate',
    line=dict(color='darkorange', dash='dash', width = 5)
)

# Create layout
layout = go.Layout(
    title='<b>Retention and Churn Rates Over Time</b>',  
    xaxis=dict(title='<b>Transaction Month</b>'),  
    yaxis=dict(title='<b>Rate</b>'),
    plot_bgcolor='rgba(0,0,0,0)',
    paper_bgcolor='rgba(0,0,0,0)',
    font=dict(color='black'),
    legend=dict(x=0.02, y=0.6)
)

# Combine traces and layout into a figure
fig = go.Figure(data=[trace_retention, trace_churn], layout=layout)

# Show the plot
fig.show()

K-means Clustering¶

In [65]:
df_user = pd.DataFrame(df['CustomerID'].unique())
df_user.columns = ['CustomerID']

# Calculate Recency
# First, calculate the last transaction date for all customers
df_recency = df.groupby('CustomerID').Transaction_Date.max().reset_index()

# Calculate Recency by subtracting the most recent purchase date of each customer by the most recent date of all customers
df_recency['Recency'] = (df_recency['Transaction_Date'].max() - df_recency['Transaction_Date']).dt.days
df_recency.columns = ['CustomerID', 'Last_Purchase_Date', 'Recency']

# Merge
df_user = pd.merge(df_user, df_recency[['CustomerID', 'Recency']], on = 'CustomerID')

df_user.Recency.describe()
Out[65]:
count    1468.000000
mean      144.292234
std       101.936959
min         0.000000
25%        55.000000
50%       131.000000
75%       220.000000
max       364.000000
Name: Recency, dtype: float64
In [66]:
# Calculate Frequency
# Group by customerID and count frequency of purchase for each customer
df_frequency = df.groupby('CustomerID').Transaction_Date.count().reset_index()

# Rename the columns
df_frequency.columns = ['CustomerID','Frequency']

# Merging with user dataframe
df_user = pd.merge(df_user, df_frequency, on = 'CustomerID')

df_user.Frequency.describe()
Out[66]:
count    1468.000000
mean       36.051771
std        50.885680
min         1.000000
25%        10.000000
50%        21.000000
75%        46.000000
max       695.000000
Name: Frequency, dtype: float64
In [67]:
# Calculate Monetary Value
# Calculate total invoice for each customer
df_monetary = df.groupby('CustomerID').Invoice.sum().reset_index()

# Rename the columns
df_monetary.columns = ['CustomerID','Monetary']

# Merging with user dataframe
df_user = pd.merge(df_user, df_monetary, on = 'CustomerID')

df_user.Monetary.describe()
Out[67]:
count     1468.000000
mean      3675.386316
std       5841.589871
min          6.990000
25%        783.974310
50%       2011.622610
75%       4495.056630
max      86910.893840
Name: Monetary, dtype: float64
In [68]:
print(df_user.columns)
Index(['CustomerID', 'Recency', 'Frequency', 'Monetary'], dtype='object')
In [69]:
df_merge = pd.merge(df_user, agg_df, on = 'CustomerID', how = 'left')
df_cluster = df_merge[['Recency', 'Frequency', 'Monetary', 'CLV1']]


# Plotting box plots for Recency, Frequency, and Monetary
plt.figure(figsize=(10, 6))
sb.boxplot(data=df_cluster)
plt.title('Box Plot of Recency, Frequency, Monetary, and CLV ')
plt.xlabel('Variables')
plt.ylabel('Values')
plt.show()
In [70]:
# Standardize the features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df_cluster)
scaled_features = pd.DataFrame(scaled_features)
In [71]:
# Plotting box plots for Scaled Recency, Frequency, and Monetary
plt.figure(figsize=(10, 6))
sb.boxplot(data=scaled_features)
plt.title('Box Plot of Recency, Frequency, Monetary, and CLV')
plt.xlabel('Variables')
plt.ylabel('Values')
plt.show()
In [72]:
# Find the optimal k using elbow curve
sse = {}
df_cluster_copy = df_cluster.copy()  # Create a copy of the DataFrame
for k in range(1, 10):
    kmeans = KMeans(n_clusters=k, max_iter=100).fit(df_cluster_copy)
    df_cluster_copy["clusters"] = kmeans.labels_
    sse[k] = kmeans.inertia_

plt.figure(figsize=(8, 6))
plt.plot(list(sse.keys()), list(sse.values()), marker='o', color='royalblue')
plt.title('Inertia vs. Number of Clusters', fontsize=18)
plt.xlabel('Number of Clusters', fontsize=15)
plt.ylabel('Inertia', fontsize=15)
plt.show()
In [73]:
# Fit the model with 4 clusters
kmeans = KMeans(n_clusters = 3)
kmeans.fit(df_cluster_copy)
df_cluster_copy['Cluster_no'] = kmeans.predict(df_cluster_copy)
df_cluster_copy.groupby('Cluster_no').describe()
Out[73]:
Recency Frequency ... CLV1 clusters
count mean std min 25% 50% 75% max count mean ... 75% max count mean std min 25% 50% 75% max
Cluster_no
0 1227.0 151.683782 101.196469 0.0 65.00 140.0 224.0 364.0 1227.0 21.558272 ... 190.087282 1660.694105 1227.0 3.341483 2.489115 0.0 0.0 4.0 4.0 7.0
1 7.0 71.571429 123.357012 0.0 11.00 16.0 62.0 339.0 7.0 501.285714 ... 223.154023 234.016186 7.0 4.000000 1.914854 2.0 2.0 5.0 5.5 6.0
2 234.0 107.709402 96.731082 1.0 32.25 79.0 159.0 362.0 234.0 98.132479 ... 210.686547 794.758900 234.0 4.303419 2.485544 1.0 3.0 3.0 8.0 8.0

3 rows × 40 columns

In [74]:
df_cluster_copy
Out[74]:
Recency Frequency Monetary CLV1 clusters Cluster_no
0 339 297 40531.38476 215.697333 5 1
1 13 44 3348.77498 120.293752 0 0
2 151 42 3320.84496 124.970956 0 0
3 123 6 967.73992 254.927794 4 0
4 32 58 8071.68837 219.961248 3 2
... ... ... ... ... ... ...
1463 114 5 65.08636 20.574522 4 0
1464 86 34 4535.09298 210.822549 7 0
1465 86 11 1053.68428 151.400600 4 0
1466 86 28 2440.62594 137.769465 0 0
1467 35 94 11252.83782 189.209957 8 2

1468 rows × 6 columns

Visualize Clusters¶

In [75]:
# Define a color palette for clusters
cluster_palette = sb.color_palette('viridis', df_cluster_copy['Cluster_no'].nunique())

# Create a figure and subplots
fig, axs = plt.subplots(2, 2, figsize=(12, 10))

# Plotting Recency vs Frequency
sb.scatterplot(data=df_cluster_copy, x='Recency', y='Frequency', hue='Cluster_no', palette=cluster_palette, ax=axs[0, 0])
axs[0, 0].set_title('Recency vs Frequency')

# Plotting Recency vs Monetary
sb.scatterplot(data=df_cluster_copy, x='Recency', y='Monetary', hue='Cluster_no', palette=cluster_palette, ax=axs[0, 1])
axs[0, 1].set_title('Recency vs Monetary')

# Plotting Frequency vs Monetary
sb.scatterplot(data=df_cluster_copy, x='Frequency', y='Monetary', hue='Cluster_no', palette=cluster_palette, ax=axs[1, 0])
axs[1, 0].set_title('Frequency vs Monetary')

# Add centroids to all plots
for cluster in np.unique(df_cluster_copy['Cluster_no']):
    centroid_recency_frequency = df_cluster_copy[df_cluster_copy['Cluster_no'] == cluster][['Recency', 'Frequency']].mean()
    centroid_recency_monetary = df_cluster_copy[df_cluster_copy['Cluster_no'] == cluster][['Recency', 'Monetary']].mean()
    centroid_frequency_monetary = df_cluster_copy[df_cluster_copy['Cluster_no'] == cluster][['Frequency', 'Monetary']].mean()
    
    axs[0, 0].plot(centroid_recency_frequency['Recency'], centroid_recency_frequency['Frequency'], marker='o', markersize=8, markeredgecolor='black', label=f'Cluster {cluster}', linestyle='', color='red')
    axs[0, 1].plot(centroid_recency_monetary['Recency'], centroid_recency_monetary['Monetary'], marker='o', markersize=8, markeredgecolor='black', label=f'Cluster {cluster}', linestyle='', color='red')
    axs[1, 0].plot(centroid_frequency_monetary['Frequency'], centroid_frequency_monetary['Monetary'], marker='o', markersize=8, markeredgecolor='black', label=f'Cluster {cluster}', linestyle='', color='red')

# Hide empty subplot
axs[1, 1].axis('off')

# Adjust layout
plt.tight_layout()

# Show the plot
plt.show()
In [76]:
from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure(figsize=(12, 10))
ax = fig.add_subplot(111, projection='3d')

# Plotting Recency vs Frequency vs Monetary
for cluster, color in zip(np.unique(df_cluster_copy['Cluster_no']), cluster_palette):
    cluster_data = df_cluster_copy[df_cluster_copy['Cluster_no'] == cluster]
    ax.scatter(cluster_data['Recency'], cluster_data['Frequency'], cluster_data['Monetary'], c=color, label=f'Cluster {cluster}')

# Add centroids to the plot
for cluster in np.unique(df_cluster_copy['Cluster_no']):
    centroid = df_cluster_copy[df_cluster_copy['Cluster_no'] == cluster][['Recency', 'Frequency', 'Monetary']].mean()
    ax.scatter(centroid['Recency'], centroid['Frequency'], centroid['Monetary'], marker='o', s=100, c='red', label=f'Cluster {cluster} Centroid')

# Set labels and title
ax.set_xlabel('Recency')
ax.set_ylabel('Frequency')
ax.set_zlabel('Monetary')
ax.set_title('3D Scatter Plot of Clusters')

# Add a legend
ax.legend()

# Show the plot
plt.show()
*c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*.  Please use the *color* keyword-argument or provide a 2D array with a single row if you intend to specify the same RGB or RGBA value for all points.
*c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*.  Please use the *color* keyword-argument or provide a 2D array with a single row if you intend to specify the same RGB or RGBA value for all points.
*c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*.  Please use the *color* keyword-argument or provide a 2D array with a single row if you intend to specify the same RGB or RGBA value for all points.
In [77]:
# Define a color palette for clusters
cluster_palette = sb.color_palette('viridis', df_cluster_copy['Cluster_no'].nunique())

# Create a figure and subplots
fig, axs = plt.subplots(2, 2, figsize=(12, 10))

# Plotting Recency vs CLV1
sb.scatterplot(data=df_cluster_copy, x='Recency', y='CLV1', hue='Cluster_no', palette=cluster_palette, ax=axs[0, 0])
axs[0, 0].set_title('Recency vs CLV1')

# Plotting Frequency vs CLV1
sb.scatterplot(data=df_cluster_copy, x='Frequency', y='CLV1', hue='Cluster_no', palette=cluster_palette, ax=axs[0, 1])
axs[0, 1].set_title('Frequency vs CLV1')

# Plotting Monetary vs CLV1
sb.scatterplot(data=df_cluster_copy, x='Monetary', y='CLV1', hue='Cluster_no', palette=cluster_palette, ax=axs[1, 0])
axs[1, 0].set_title('Monetary vs CLV1')

# Add centroids to plots
for cluster in np.unique(df_cluster_copy['Cluster_no']):
    centroid = df_cluster_copy[df_cluster_copy['Cluster_no'] == cluster][['Recency', 'Frequency', 'Monetary', 'CLV1']].mean()
    axs[0, 0].plot(centroid['Recency'], centroid['CLV1'], marker='o', markersize=8, markeredgecolor='black', label=f'Cluster {cluster}', linestyle='', color='red')
    axs[0, 1].plot(centroid['Frequency'], centroid['CLV1'], marker='o', markersize=8, markeredgecolor='black', label=f'Cluster {cluster}', linestyle='', color='red')
    axs[1, 0].plot(centroid['Monetary'], centroid['CLV1'], marker='o', markersize=8, markeredgecolor='black', label=f'Cluster {cluster}', linestyle='', color='red')

# Hide empty subplot
axs[1, 1].axis('off')

# Adjust layout
plt.tight_layout()

# Show the plot
plt.show()

Profiling¶

In [78]:
# Reset the index of agg_df to turn 'CustomerID' into a column
agg_df = agg_df.reset_index()

# Consider 'CLV1' is a unique key in both DataFrames, we then combine grouped_profile and cluster dataset df_cluster_copy
merged_profile = pd.merge(agg_df[['CustomerID','Gender', 'Location', 'Quantity', 'Tenure', 'AOV', 'CLV1', 'Product Category', 'Coupon Status']], df_cluster_copy.drop(columns='clusters'), on='CLV1', how='left')
In [79]:
merged_profile
Out[79]:
CustomerID Gender Location Quantity Tenure AOV CLV1 Product Category Coupon Status Recency Frequency Monetary Cluster_no
0 12346 F New York 3 31 87.490870 138.284186 Apparel Used 107 2 174.98174 0
1 12347 M New York 342 20 261.447399 413.232155 Nest-USA Clicked 59 60 15686.84396 2
2 12348 M California 209 39 73.458954 116.105962 Office Clicked 73 23 1689.55594 0
3 12350 M California 21 25 86.319722 136.433122 Apparel Clicked 17 17 1467.43528 0
4 12356 F Chicago 56 31 55.761917 88.134810 Apparel Clicked 107 36 2007.42900 0
... ... ... ... ... ... ... ... ... ... ... ... ... ...
1469 18260 M New York 141 43 75.413593 119.195378 Apparel Clicked 87 40 3016.54372 0
1470 18269 M Chicago 10 25 20.863437 32.975823 Apparel Clicked 194 8 166.90750 0
1471 18277 F Chicago 2 47 333.800000 527.589464 Nest-USA Not Used 69 1 333.80000 0
1472 18277 F Chicago 2 47 333.800000 527.589464 Nest-USA Not Used 223 1 333.80000 0
1473 18283 F New Jersey 155 36 72.125994 113.999145 Apparel Clicked 82 102 7356.85141 2

1474 rows × 13 columns

In [80]:
# Break up of the number of observations in each cluster
cluster_counts = merged_profile.Cluster_no.value_counts()
cluster_counts
Out[80]:
0    1233
2     234
1       7
Name: Cluster_no, dtype: int64
In [81]:
# Segment Distribtuion % wise:
cluster_percentages = (cluster_counts / merged_profile['Cluster_no'].size) * 100
cluster_percentages
Out[81]:
0    83.649932
2    15.875170
1     0.474898
Name: Cluster_no, dtype: float64
In [82]:
# For each cluster, calculate statistics for each attribute to profile the clusters.
# For numerical attributes, use mean.
# For categorical attributes, count the frequency of each category.

# Set seed for reproducibility
np.random.seed(3)

# Profiling for numerical attributes
numerical_profile = merged_profile.groupby('Cluster_no').agg({
    'Quantity': 'mean',  
    'Tenure': 'mean',  
    'AOV': 'mean',
    'Recency': 'mean',
    'Frequency': 'mean',
    'Monetary': 'mean',
    'CLV1':'mean'
})

# Profiling for categorical attributes
categorical_profile = merged_profile.groupby('Cluster_no').agg({
    'Location': lambda x: x.mode()[0], 
    'Gender': lambda x: x.mode()[0],
    'Product Category': lambda x: x.mode()[0],
    'Coupon Status': lambda x: x.mode()[0]
})

# Combine the numerical and categorical profiles into one DataFrame
complete_profile = numerical_profile.join(categorical_profile).reset_index()

# Displaying the complete profile for each cluster
print(complete_profile)
   Cluster_no     Quantity     Tenure         AOV     Recency   Frequency  \
0           0    86.865369  25.845904   96.998251  151.868613   21.458232   
1           1  2806.571429  30.285714  125.516719   71.571429  501.285714   
2           2   475.606838  26.145299  123.204172  107.709402   98.132479   

       Monetary        CLV1    Location Gender Product Category Coupon Status  
0   1971.506761  153.311130  California      F          Apparel       Clicked  
1  61713.066070  198.386155     Chicago      F          Apparel       Clicked  
2  10828.387101  194.731047  California      F          Apparel       Clicked  
In [83]:
merged_profile
Out[83]:
CustomerID Gender Location Quantity Tenure AOV CLV1 Product Category Coupon Status Recency Frequency Monetary Cluster_no
0 12346 F New York 3 31 87.490870 138.284186 Apparel Used 107 2 174.98174 0
1 12347 M New York 342 20 261.447399 413.232155 Nest-USA Clicked 59 60 15686.84396 2
2 12348 M California 209 39 73.458954 116.105962 Office Clicked 73 23 1689.55594 0
3 12350 M California 21 25 86.319722 136.433122 Apparel Clicked 17 17 1467.43528 0
4 12356 F Chicago 56 31 55.761917 88.134810 Apparel Clicked 107 36 2007.42900 0
... ... ... ... ... ... ... ... ... ... ... ... ... ...
1469 18260 M New York 141 43 75.413593 119.195378 Apparel Clicked 87 40 3016.54372 0
1470 18269 M Chicago 10 25 20.863437 32.975823 Apparel Clicked 194 8 166.90750 0
1471 18277 F Chicago 2 47 333.800000 527.589464 Nest-USA Not Used 69 1 333.80000 0
1472 18277 F Chicago 2 47 333.800000 527.589464 Nest-USA Not Used 223 1 333.80000 0
1473 18283 F New Jersey 155 36 72.125994 113.999145 Apparel Clicked 82 102 7356.85141 2

1474 rows × 13 columns

Cluster 0 - "Gold - Potential Loyalists"¶

  • Average customer relationship length
  • Recent interaction
  • High AOV
  • Second highest frequency and monetary value
  • Location: California
  • Gender: predominantly female

Customers here perhaps making fewer purchases than cluster 2. However, their purchase value per visit us high just as much as premium customers. And even though their last interaction isn't the most recent, they have the high potential future value.

Cluster 1 - "Standard - New and Occasional"¶

  • Average customer relationship length
  • Least recent
  • Lowest AOV, frequency, monetary value and CLV
  • Location: California
  • Gender: predominantly female

This cluster is the least engaged with the brand. They shop infrequently, spend the least amount of money, and have not made purchases recently, hinting at a risk of churn. The low AOV indicates that customers in this group are purchasing less expensive items or smaller quantities. Their CLV suggests limited long-term profitability.

Cluster 2 - "Premium - Loyal High Spenders"¶

  • Longest customer relationship length
  • Most recent interaction
  • Second highest AOV and CLV
  • Highest frequency and monetary value
  • Location: Chicago
  • Gender: predominantly female

This is the most loyal segment with customers that spend a lot and shop most frequently, have recently interacted with the brand, and have a long-standing relationship, thus considered 'Premium'.

CLV Prediction¶

In [84]:
# Define a dependent variable 
# Define labels based on cluster characteristics
def label_clusters(cluster):
    if cluster == 1:
        return 'Premium'
    elif cluster == 2:
        return 'Gold'
    else:
        return 'Silver'

# Apply the label_clusters function to assign labels
merged_profile['Segment_Label'] = merged_profile['Cluster_no'].apply(label_clusters)
In [85]:
merged_profile.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1474 entries, 0 to 1473
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   CustomerID        1474 non-null   int64  
 1   Gender            1474 non-null   object 
 2   Location          1474 non-null   object 
 3   Quantity          1474 non-null   int64  
 4   Tenure            1474 non-null   int64  
 5   AOV               1474 non-null   float64
 6   CLV1              1474 non-null   float64
 7   Product Category  1474 non-null   object 
 8   Coupon Status     1474 non-null   object 
 9   Recency           1474 non-null   int64  
 10  Frequency         1474 non-null   int64  
 11  Monetary          1474 non-null   float64
 12  Cluster_no        1474 non-null   int32  
 13  Segment_Label     1474 non-null   object 
dtypes: float64(3), int32(1), int64(5), object(5)
memory usage: 167.0+ KB

Logistic Regression¶

In [86]:
# Split the data into features and target
feature_cols = ['Quantity', 'Gender','Location','Tenure', 'AOV']
X = merged_profile[feature_cols] 
X = pd.get_dummies(X, columns=['Gender', 'Location'])
y = merged_profile['Segment_Label']  

# Scale the features 
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Oversampling the training set
from imblearn.over_sampling import SMOTE
#smote = SMOTE(k_neighbors=3, random_state=42)

# Resample the data
#X_resampled, y_resampled = smote.fit_resample(X, y)

from imblearn.combine import SMOTETomek
# SMOTE-Tomek
smote_tomek = SMOTETomek(sampling_strategy='auto', random_state=42, smote=SMOTE(k_neighbors=3))
# Resample the data
X_resampled, y_resampled = smote_tomek.fit_resample(X, y)

# Check the class after oversampling
unique, counts = np.unique(y_resampled, return_counts=True)
print(dict(zip(unique, counts)))
{'Gold': 1211, 'Premium': 1233, 'Silver': 1211}
In [87]:
# Splitting the dataset into training (70%) and testing (30%)
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)

# Create the model, using the 'lbfgs' solver for multinomial logistic regression
model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)

# Fit the model to the training data
y_score = model.fit(X_train, y_train).predict_proba(X_test)

# Predict the labels for the test set
y_pred = model.predict(X_test)

# Calculate and print the accuracy score
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')
Accuracy: 0.89
In [88]:
y_pred_train = model.predict(X_train)
# Accuracy of training set
accuracy_train = accuracy_score(y_train, y_pred_train)
print(f'Accuracy: {accuracy_train:.2f}')
Accuracy: 0.90
In [89]:
from sklearn.model_selection import learning_curve

train_sizes, train_scores, validation_scores = learning_curve(
    estimator = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000),
    X = X, y = y, train_sizes = np.linspace(0.01, 1.0, 50), cv = 5
)

# Mean and standard deviation of training and validation scores
train_scores_mean = np.mean(train_scores, axis=1)
validation_scores_mean = np.mean(validation_scores, axis=1)

# Plot learning curves
plt.figure()
plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
plt.plot(train_sizes, validation_scores_mean, 'o-', color="g", label="Cross-validation score")
plt.title("Learning Curve")
plt.xlabel("Training Set Size")
plt.ylabel("Accuracy Score")
plt.legend(loc="best")
plt.show()
In [90]:
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import ConfusionMatrixDisplay

# Confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)
ConfusionMatrixDisplay(confusion_matrix=conf_matrix).plot()

# Classification report
classif_report = classification_report(y_test, y_pred)
print("\nClassification Report:")
print(classif_report)
Confusion Matrix:
[[291  11  57]
 [  0 348   0]
 [ 56   0 334]]

Classification Report:
              precision    recall  f1-score   support

        Gold       0.84      0.81      0.82       359
     Premium       0.97      1.00      0.98       348
      Silver       0.85      0.86      0.86       390

    accuracy                           0.89      1097
   macro avg       0.89      0.89      0.89      1097
weighted avg       0.89      0.89      0.89      1097

Classification Tree¶

In [91]:
# Create Decision Tree classifer object
dt = DecisionTreeClassifier()

# Train Decision Tree Classifer
dt = dt.fit(X_train,y_train)

#Predict the response for test dataset
y_pred_dt = dt.predict(X_test)

# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred_dt))
Accuracy: 0.9197812215132178

Random Forest¶

In [92]:
from sklearn.ensemble import RandomForestClassifier

# Initialize the Random Forest classifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Fit the classifier to the training data
rf.fit(X_train, y_train)

# Predict on the test data
y_pred = rf.predict(X_test)

# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classif_report = classification_report(y_test, y_pred)

# Print the results
print(f"Random Forest Model Accuracy: {accuracy:.2f}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(classif_report)
Random Forest Model Accuracy: 0.95
Confusion Matrix:
[[354   0   5]
 [  2 346   0]
 [ 51   0 339]]
Classification Report:
              precision    recall  f1-score   support

        Gold       0.87      0.99      0.92       359
     Premium       1.00      0.99      1.00       348
      Silver       0.99      0.87      0.92       390

    accuracy                           0.95      1097
   macro avg       0.95      0.95      0.95      1097
weighted avg       0.95      0.95      0.95      1097

XGBoost¶

In [93]:
# Encode string class labels to integers
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Fit the XGBoost model using the encoded labels
xgb_classifier = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
xgb_classifier.fit(X_train, y_train_encoded)

# Predict on the test data
y_pred_encoded = xgb_classifier.predict(X_test)
y_pred = label_encoder.inverse_transform(y_pred_encoded) # Convert predictions back to original labels

# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classif_report = classification_report(y_test, y_pred)

# Print the results
print(f"XGBoost Model Accuracy: {accuracy:.2f}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(classif_report)
XGBoost Model Accuracy: 0.94
Confusion Matrix:
[[344   2  13]
 [  2 346   0]
 [ 49   0 341]]
Classification Report:
              precision    recall  f1-score   support

        Gold       0.87      0.96      0.91       359
     Premium       0.99      0.99      0.99       348
      Silver       0.96      0.87      0.92       390

    accuracy                           0.94      1097
   macro avg       0.94      0.94      0.94      1097
weighted avg       0.94      0.94      0.94      1097

Market Basket Analysis¶

In [94]:
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

basket = df.groupby(['Transaction_ID', 'Product_Category'])['Quantity'].sum().unstack().fillna(0)
basket = basket > 0  # This converts data directly to boolean

# Apply Apriori algorithm to find frequent itemsets
frequent_itemsets = apriori(basket, min_support=0.03, use_colnames=True)

# Extract association rules
association_rules_df = association_rules(frequent_itemsets, metric='lift', min_threshold=0.5)

# Interpret the association rules and identify products to bundle

# For example, to identify items that are frequently purchased together:
frequent_itemsets['itemsets'].apply(lambda x: list(x))
Out[94]:
0                [Apparel]
1                   [Bags]
2              [Drinkware]
3              [Lifestyle]
4                   [Nest]
5               [Nest-USA]
6                 [Office]
7     [Drinkware, Apparel]
8     [Lifestyle, Apparel]
9        [Apparel, Office]
10     [Drinkware, Office]
11     [Lifestyle, Office]
Name: itemsets, dtype: object
In [95]:
association_rules_df
Out[95]:
antecedents consequents antecedent support consequent support support confidence lift leverage conviction zhangs_metric
0 (Drinkware) (Apparel) 0.100714 0.324369 0.045010 0.446910 1.377784 0.012342 1.221557 0.304905
1 (Apparel) (Drinkware) 0.324369 0.100714 0.045010 0.138762 1.377784 0.012342 1.044179 0.405838
2 (Lifestyle) (Apparel) 0.068313 0.324369 0.033079 0.484229 1.492836 0.010921 1.309945 0.354340
3 (Apparel) (Lifestyle) 0.324369 0.068313 0.033079 0.101981 1.492836 0.010921 1.037491 0.488630
4 (Apparel) (Office) 0.324369 0.140697 0.062128 0.191536 1.361343 0.016491 1.062884 0.392864
5 (Office) (Apparel) 0.140697 0.324369 0.062128 0.441577 1.361343 0.016491 1.209892 0.308891
6 (Drinkware) (Office) 0.100714 0.140697 0.046287 0.459588 3.266516 0.032117 1.590089 0.771572
7 (Office) (Drinkware) 0.140697 0.100714 0.046287 0.328985 3.266516 0.032117 1.340187 0.807472
8 (Lifestyle) (Office) 0.068313 0.140697 0.035114 0.514019 3.653381 0.025503 1.768182 0.779533
9 (Office) (Lifestyle) 0.140697 0.068313 0.035114 0.249575 3.653381 0.025503 1.241545 0.845197

Personalized Marketing¶

In [96]:
# Example mapping
product_transaction_mapping = df.reset_index().groupby('Product_Category')['Transaction_ID'].unique()

# Define function to find transactions for complex itemsets
def find_transactions_for_itemset(itemset):
    itemset = list(itemset) 
    if not itemset:
        return []
    possible_transactions = set(product_transaction_mapping[itemset[0]])
    for item in itemset[1:]:
        if item in product_transaction_mapping:
            possible_transactions.intersection_update(product_transaction_mapping[item])
        else:
            return []  
    return list(possible_transactions)

# Calculate the transactions associated with each antecedent itemset in the DataFrame and stores this information in a new column called 'Transactions'
association_rules_df['Transactions'] = association_rules_df['antecedents'].apply(find_transactions_for_itemset)

# Define function to find CustomerIDs for these transactions
def find_customers_for_transactions(transactions):
    return df[df['Transaction_ID'].isin(transactions)]['CustomerID'].unique()

association_rules_df['CustomerIDs'] = association_rules_df['Transactions'].apply(find_customers_for_transactions)

# Include full itemset details when merging with customer profiles
def merge_with_profile(customer_ids, itemset):
    profiles = merged_profile[merged_profile['CustomerID'].isin(customer_ids)]
    profiles['Itemset'] = [', '.join(itemset)] * len(profiles)  # Convert itemset to a comma-separated string
    return profiles

# Usage example to get profiles for a specific rule, using both antecedents and consequents
for index, row in association_rules_df.iterrows():
    itemset_example = list(row['antecedents']) + list(row['consequents'])  # Combine both antecedents and consequents
    profiles_linked_to_rule = merge_with_profile(row['CustomerIDs'], itemset_example)
    print(f"Profiles linked to itemset {itemset_example}:")
    print(profiles_linked_to_rule)
Profiles linked to itemset ['Drinkware', 'Apparel']:
      CustomerID Gender       Location  Quantity  Tenure         AOV  \
1          12347      M       New York       342      20  261.447399   
2          12348      M     California       209      39   73.458954   
4          12356      F        Chicago        56      31   55.761917   
6          12370      F       New York       613      21   88.515242   
7          12373      F       New York        22      23   42.540433   
...          ...    ...            ...       ...     ...         ...   
1464       18239      M       New York       140      20  133.231426   
1465       18245      F  Washington DC        86       5  153.096153   
1468       18259      F     California        46       5  122.955143   
1469       18260      M       New York       141      43   75.413593   
1473       18283      F     New Jersey       155      36   72.125994   

            CLV1 Product Category Coupon Status  Recency  Frequency  \
1     413.232155         Nest-USA       Clicked       59         60   
2     116.105962           Office       Clicked       73         23   
4      88.134810          Apparel       Clicked      107         36   
6     139.903263          Apparel       Clicked      189         91   
7      67.237520          Apparel       Clicked       24         14   
...          ...              ...           ...      ...        ...   
1464  210.579679          Apparel       Clicked       34         52   
1465  241.976985         Nest-USA       Clicked       33         55   
1468  194.337441          Apparel       Clicked      270          7   
1469  119.195378          Apparel       Clicked       87         40   
1473  113.999145          Apparel       Clicked       82        102   

         Monetary  Cluster_no Segment_Label             Itemset  
1     15686.84396           2          Gold  Drinkware, Apparel  
2      1689.55594           0        Silver  Drinkware, Apparel  
4      2007.42900           0        Silver  Drinkware, Apparel  
6      8054.88702           2          Gold  Drinkware, Apparel  
7       595.56606           0        Silver  Drinkware, Apparel  
...           ...         ...           ...                 ...  
1464   6928.03416           2          Gold  Drinkware, Apparel  
1465   8420.28844           2          Gold  Drinkware, Apparel  
1468    860.68600           0        Silver  Drinkware, Apparel  
1469   3016.54372           0        Silver  Drinkware, Apparel  
1473   7356.85141           2          Gold  Drinkware, Apparel  

[1031 rows x 15 columns]
Profiles linked to itemset ['Apparel', 'Drinkware']:
      CustomerID Gender    Location  Quantity  Tenure         AOV        CLV1  \
0          12346      F    New York         3      31   87.490870  138.284186   
1          12347      M    New York       342      20  261.447399  413.232155   
2          12348      M  California       209      39   73.458954  116.105962   
3          12350      M  California        21      25   86.319722  136.433122   
4          12356      F     Chicago        56      31   55.761917   88.134810   
...          ...    ...         ...       ...     ...         ...         ...   
1467       18256      M  California         4      45   20.838400   32.936250   
1468       18259      F  California        46       5  122.955143  194.337441   
1469       18260      M    New York       141      43   75.413593  119.195378   
1470       18269      M     Chicago        10      25   20.863437   32.975823   
1473       18283      F  New Jersey       155      36   72.125994  113.999145   

     Product Category Coupon Status  Recency  Frequency     Monetary  \
0             Apparel          Used      107          2    174.98174   
1            Nest-USA       Clicked       59         60  15686.84396   
2              Office       Clicked       73         23   1689.55594   
3             Apparel       Clicked       17         17   1467.43528   
4             Apparel       Clicked      107         36   2007.42900   
...               ...           ...      ...        ...          ...   
1467          Apparel       Clicked      179          4     83.35360   
1468          Apparel       Clicked      270          7    860.68600   
1469          Apparel       Clicked       87         40   3016.54372   
1470          Apparel       Clicked      194          8    166.90750   
1473          Apparel       Clicked       82        102   7356.85141   

      Cluster_no Segment_Label             Itemset  
0              0        Silver  Apparel, Drinkware  
1              2          Gold  Apparel, Drinkware  
2              0        Silver  Apparel, Drinkware  
3              0        Silver  Apparel, Drinkware  
4              0        Silver  Apparel, Drinkware  
...          ...           ...                 ...  
1467           0        Silver  Apparel, Drinkware  
1468           0        Silver  Apparel, Drinkware  
1469           0        Silver  Apparel, Drinkware  
1470           0        Silver  Apparel, Drinkware  
1473           2          Gold  Apparel, Drinkware  

[1367 rows x 15 columns]
Profiles linked to itemset ['Lifestyle', 'Apparel']:
      CustomerID Gender    Location  Quantity  Tenure         AOV        CLV1  \
1          12347      M    New York       342      20  261.447399  413.232155   
2          12348      M  California       209      39   73.458954  116.105962   
4          12356      F     Chicago        56      31   55.761917   88.134810   
6          12370      F    New York       613      21   88.515242  139.903263   
8          12377      F  California       421      27  146.037671  230.820661   
...          ...    ...         ...       ...     ...         ...         ...   
1463       18233      F  New Jersey        50       2   82.615000  130.577602   
1464       18239      M    New York       140      20  133.231426  210.579679   
1468       18259      F  California        46       5  122.955143  194.337441   
1469       18260      M    New York       141      43   75.413593  119.195378   
1473       18283      F  New Jersey       155      36   72.125994  113.999145   

     Product Category Coupon Status  Recency  Frequency     Monetary  \
1            Nest-USA       Clicked       59         60  15686.84396   
2              Office       Clicked       73         23   1689.55594   
4             Apparel       Clicked      107         36   2007.42900   
6             Apparel       Clicked      189         91   8054.88702   
8             Apparel       Clicked       40         77  11244.90066   
...               ...           ...      ...        ...          ...   
1463           Office          Used      108          1     82.61500   
1464          Apparel       Clicked       34         52   6928.03416   
1468          Apparel       Clicked      270          7    860.68600   
1469          Apparel       Clicked       87         40   3016.54372   
1473          Apparel       Clicked       82        102   7356.85141   

      Cluster_no Segment_Label             Itemset  
1              2          Gold  Lifestyle, Apparel  
2              0        Silver  Lifestyle, Apparel  
4              0        Silver  Lifestyle, Apparel  
6              2          Gold  Lifestyle, Apparel  
8              2          Gold  Lifestyle, Apparel  
...          ...           ...                 ...  
1463           0        Silver  Lifestyle, Apparel  
1464           2          Gold  Lifestyle, Apparel  
1468           0        Silver  Lifestyle, Apparel  
1469           0        Silver  Lifestyle, Apparel  
1473           2          Gold  Lifestyle, Apparel  

[932 rows x 15 columns]
Profiles linked to itemset ['Apparel', 'Lifestyle']:
      CustomerID Gender    Location  Quantity  Tenure         AOV        CLV1  \
0          12346      F    New York         3      31   87.490870  138.284186   
1          12347      M    New York       342      20  261.447399  413.232155   
2          12348      M  California       209      39   73.458954  116.105962   
3          12350      M  California        21      25   86.319722  136.433122   
4          12356      F     Chicago        56      31   55.761917   88.134810   
...          ...    ...         ...       ...     ...         ...         ...   
1467       18256      M  California         4      45   20.838400   32.936250   
1468       18259      F  California        46       5  122.955143  194.337441   
1469       18260      M    New York       141      43   75.413593  119.195378   
1470       18269      M     Chicago        10      25   20.863437   32.975823   
1473       18283      F  New Jersey       155      36   72.125994  113.999145   

     Product Category Coupon Status  Recency  Frequency     Monetary  \
0             Apparel          Used      107          2    174.98174   
1            Nest-USA       Clicked       59         60  15686.84396   
2              Office       Clicked       73         23   1689.55594   
3             Apparel       Clicked       17         17   1467.43528   
4             Apparel       Clicked      107         36   2007.42900   
...               ...           ...      ...        ...          ...   
1467          Apparel       Clicked      179          4     83.35360   
1468          Apparel       Clicked      270          7    860.68600   
1469          Apparel       Clicked       87         40   3016.54372   
1470          Apparel       Clicked      194          8    166.90750   
1473          Apparel       Clicked       82        102   7356.85141   

      Cluster_no Segment_Label             Itemset  
0              0        Silver  Apparel, Lifestyle  
1              2          Gold  Apparel, Lifestyle  
2              0        Silver  Apparel, Lifestyle  
3              0        Silver  Apparel, Lifestyle  
4              0        Silver  Apparel, Lifestyle  
...          ...           ...                 ...  
1467           0        Silver  Apparel, Lifestyle  
1468           0        Silver  Apparel, Lifestyle  
1469           0        Silver  Apparel, Lifestyle  
1470           0        Silver  Apparel, Lifestyle  
1473           2          Gold  Apparel, Lifestyle  

[1367 rows x 15 columns]
Profiles linked to itemset ['Apparel', 'Office']:
      CustomerID Gender    Location  Quantity  Tenure         AOV        CLV1  \
0          12346      F    New York         3      31   87.490870  138.284186   
1          12347      M    New York       342      20  261.447399  413.232155   
2          12348      M  California       209      39   73.458954  116.105962   
3          12350      M  California        21      25   86.319722  136.433122   
4          12356      F     Chicago        56      31   55.761917   88.134810   
...          ...    ...         ...       ...     ...         ...         ...   
1467       18256      M  California         4      45   20.838400   32.936250   
1468       18259      F  California        46       5  122.955143  194.337441   
1469       18260      M    New York       141      43   75.413593  119.195378   
1470       18269      M     Chicago        10      25   20.863437   32.975823   
1473       18283      F  New Jersey       155      36   72.125994  113.999145   

     Product Category Coupon Status  Recency  Frequency     Monetary  \
0             Apparel          Used      107          2    174.98174   
1            Nest-USA       Clicked       59         60  15686.84396   
2              Office       Clicked       73         23   1689.55594   
3             Apparel       Clicked       17         17   1467.43528   
4             Apparel       Clicked      107         36   2007.42900   
...               ...           ...      ...        ...          ...   
1467          Apparel       Clicked      179          4     83.35360   
1468          Apparel       Clicked      270          7    860.68600   
1469          Apparel       Clicked       87         40   3016.54372   
1470          Apparel       Clicked      194          8    166.90750   
1473          Apparel       Clicked       82        102   7356.85141   

      Cluster_no Segment_Label          Itemset  
0              0        Silver  Apparel, Office  
1              2          Gold  Apparel, Office  
2              0        Silver  Apparel, Office  
3              0        Silver  Apparel, Office  
4              0        Silver  Apparel, Office  
...          ...           ...              ...  
1467           0        Silver  Apparel, Office  
1468           0        Silver  Apparel, Office  
1469           0        Silver  Apparel, Office  
1470           0        Silver  Apparel, Office  
1473           2          Gold  Apparel, Office  

[1367 rows x 15 columns]
Profiles linked to itemset ['Office', 'Apparel']:
      CustomerID Gender    Location  Quantity  Tenure         AOV        CLV1  \
0          12346      F    New York         3      31   87.490870  138.284186   
1          12347      M    New York       342      20  261.447399  413.232155   
2          12348      M  California       209      39   73.458954  116.105962   
4          12356      F     Chicago        56      31   55.761917   88.134810   
5          12359      M    New York        86      41   49.002075   77.450505   
...          ...    ...         ...       ...     ...         ...         ...   
1469       18260      M    New York       141      43   75.413593  119.195378   
1470       18269      M     Chicago        10      25   20.863437   32.975823   
1471       18277      F     Chicago         2      47  333.800000  527.589464   
1472       18277      F     Chicago         2      47  333.800000  527.589464   
1473       18283      F  New Jersey       155      36   72.125994  113.999145   

     Product Category Coupon Status  Recency  Frequency     Monetary  \
0             Apparel          Used      107          2    174.98174   
1            Nest-USA       Clicked       59         60  15686.84396   
2              Office       Clicked       73         23   1689.55594   
4             Apparel       Clicked      107         36   2007.42900   
5             Apparel       Clicked      130         16    784.03320   
...               ...           ...      ...        ...          ...   
1469          Apparel       Clicked       87         40   3016.54372   
1470          Apparel       Clicked      194          8    166.90750   
1471         Nest-USA      Not Used       69          1    333.80000   
1472         Nest-USA      Not Used      223          1    333.80000   
1473          Apparel       Clicked       82        102   7356.85141   

      Cluster_no Segment_Label          Itemset  
0              0        Silver  Office, Apparel  
1              2          Gold  Office, Apparel  
2              0        Silver  Office, Apparel  
4              0        Silver  Office, Apparel  
5              0        Silver  Office, Apparel  
...          ...           ...              ...  
1469           0        Silver  Office, Apparel  
1470           0        Silver  Office, Apparel  
1471           0        Silver  Office, Apparel  
1472           0        Silver  Office, Apparel  
1473           2          Gold  Office, Apparel  

[1203 rows x 15 columns]
Profiles linked to itemset ['Drinkware', 'Office']:
      CustomerID Gender       Location  Quantity  Tenure         AOV  \
1          12347      M       New York       342      20  261.447399   
2          12348      M     California       209      39   73.458954   
4          12356      F        Chicago        56      31   55.761917   
6          12370      F       New York       613      21   88.515242   
7          12373      F       New York        22      23   42.540433   
...          ...    ...            ...       ...     ...         ...   
1464       18239      M       New York       140      20  133.231426   
1465       18245      F  Washington DC        86       5  153.096153   
1468       18259      F     California        46       5  122.955143   
1469       18260      M       New York       141      43   75.413593   
1473       18283      F     New Jersey       155      36   72.125994   

            CLV1 Product Category Coupon Status  Recency  Frequency  \
1     413.232155         Nest-USA       Clicked       59         60   
2     116.105962           Office       Clicked       73         23   
4      88.134810          Apparel       Clicked      107         36   
6     139.903263          Apparel       Clicked      189         91   
7      67.237520          Apparel       Clicked       24         14   
...          ...              ...           ...      ...        ...   
1464  210.579679          Apparel       Clicked       34         52   
1465  241.976985         Nest-USA       Clicked       33         55   
1468  194.337441          Apparel       Clicked      270          7   
1469  119.195378          Apparel       Clicked       87         40   
1473  113.999145          Apparel       Clicked       82        102   

         Monetary  Cluster_no Segment_Label            Itemset  
1     15686.84396           2          Gold  Drinkware, Office  
2      1689.55594           0        Silver  Drinkware, Office  
4      2007.42900           0        Silver  Drinkware, Office  
6      8054.88702           2          Gold  Drinkware, Office  
7       595.56606           0        Silver  Drinkware, Office  
...           ...         ...           ...                ...  
1464   6928.03416           2          Gold  Drinkware, Office  
1465   8420.28844           2          Gold  Drinkware, Office  
1468    860.68600           0        Silver  Drinkware, Office  
1469   3016.54372           0        Silver  Drinkware, Office  
1473   7356.85141           2          Gold  Drinkware, Office  

[1031 rows x 15 columns]
Profiles linked to itemset ['Office', 'Drinkware']:
      CustomerID Gender    Location  Quantity  Tenure         AOV        CLV1  \
0          12346      F    New York         3      31   87.490870  138.284186   
1          12347      M    New York       342      20  261.447399  413.232155   
2          12348      M  California       209      39   73.458954  116.105962   
4          12356      F     Chicago        56      31   55.761917   88.134810   
5          12359      M    New York        86      41   49.002075   77.450505   
...          ...    ...         ...       ...     ...         ...         ...   
1469       18260      M    New York       141      43   75.413593  119.195378   
1470       18269      M     Chicago        10      25   20.863437   32.975823   
1471       18277      F     Chicago         2      47  333.800000  527.589464   
1472       18277      F     Chicago         2      47  333.800000  527.589464   
1473       18283      F  New Jersey       155      36   72.125994  113.999145   

     Product Category Coupon Status  Recency  Frequency     Monetary  \
0             Apparel          Used      107          2    174.98174   
1            Nest-USA       Clicked       59         60  15686.84396   
2              Office       Clicked       73         23   1689.55594   
4             Apparel       Clicked      107         36   2007.42900   
5             Apparel       Clicked      130         16    784.03320   
...               ...           ...      ...        ...          ...   
1469          Apparel       Clicked       87         40   3016.54372   
1470          Apparel       Clicked      194          8    166.90750   
1471         Nest-USA      Not Used       69          1    333.80000   
1472         Nest-USA      Not Used      223          1    333.80000   
1473          Apparel       Clicked       82        102   7356.85141   

      Cluster_no Segment_Label            Itemset  
0              0        Silver  Office, Drinkware  
1              2          Gold  Office, Drinkware  
2              0        Silver  Office, Drinkware  
4              0        Silver  Office, Drinkware  
5              0        Silver  Office, Drinkware  
...          ...           ...                ...  
1469           0        Silver  Office, Drinkware  
1470           0        Silver  Office, Drinkware  
1471           0        Silver  Office, Drinkware  
1472           0        Silver  Office, Drinkware  
1473           2          Gold  Office, Drinkware  

[1203 rows x 15 columns]
Profiles linked to itemset ['Lifestyle', 'Office']:
      CustomerID Gender    Location  Quantity  Tenure         AOV        CLV1  \
1          12347      M    New York       342      20  261.447399  413.232155   
2          12348      M  California       209      39   73.458954  116.105962   
4          12356      F     Chicago        56      31   55.761917   88.134810   
6          12370      F    New York       613      21   88.515242  139.903263   
8          12377      F  California       421      27  146.037671  230.820661   
...          ...    ...         ...       ...     ...         ...         ...   
1463       18233      F  New Jersey        50       2   82.615000  130.577602   
1464       18239      M    New York       140      20  133.231426  210.579679   
1468       18259      F  California        46       5  122.955143  194.337441   
1469       18260      M    New York       141      43   75.413593  119.195378   
1473       18283      F  New Jersey       155      36   72.125994  113.999145   

     Product Category Coupon Status  Recency  Frequency     Monetary  \
1            Nest-USA       Clicked       59         60  15686.84396   
2              Office       Clicked       73         23   1689.55594   
4             Apparel       Clicked      107         36   2007.42900   
6             Apparel       Clicked      189         91   8054.88702   
8             Apparel       Clicked       40         77  11244.90066   
...               ...           ...      ...        ...          ...   
1463           Office          Used      108          1     82.61500   
1464          Apparel       Clicked       34         52   6928.03416   
1468          Apparel       Clicked      270          7    860.68600   
1469          Apparel       Clicked       87         40   3016.54372   
1473          Apparel       Clicked       82        102   7356.85141   

      Cluster_no Segment_Label            Itemset  
1              2          Gold  Lifestyle, Office  
2              0        Silver  Lifestyle, Office  
4              0        Silver  Lifestyle, Office  
6              2          Gold  Lifestyle, Office  
8              2          Gold  Lifestyle, Office  
...          ...           ...                ...  
1463           0        Silver  Lifestyle, Office  
1464           2          Gold  Lifestyle, Office  
1468           0        Silver  Lifestyle, Office  
1469           0        Silver  Lifestyle, Office  
1473           2          Gold  Lifestyle, Office  

[932 rows x 15 columns]
Profiles linked to itemset ['Office', 'Lifestyle']:
      CustomerID Gender    Location  Quantity  Tenure         AOV        CLV1  \
0          12346      F    New York         3      31   87.490870  138.284186   
1          12347      M    New York       342      20  261.447399  413.232155   
2          12348      M  California       209      39   73.458954  116.105962   
4          12356      F     Chicago        56      31   55.761917   88.134810   
5          12359      M    New York        86      41   49.002075   77.450505   
...          ...    ...         ...       ...     ...         ...         ...   
1469       18260      M    New York       141      43   75.413593  119.195378   
1470       18269      M     Chicago        10      25   20.863437   32.975823   
1471       18277      F     Chicago         2      47  333.800000  527.589464   
1472       18277      F     Chicago         2      47  333.800000  527.589464   
1473       18283      F  New Jersey       155      36   72.125994  113.999145   

     Product Category Coupon Status  Recency  Frequency     Monetary  \
0             Apparel          Used      107          2    174.98174   
1            Nest-USA       Clicked       59         60  15686.84396   
2              Office       Clicked       73         23   1689.55594   
4             Apparel       Clicked      107         36   2007.42900   
5             Apparel       Clicked      130         16    784.03320   
...               ...           ...      ...        ...          ...   
1469          Apparel       Clicked       87         40   3016.54372   
1470          Apparel       Clicked      194          8    166.90750   
1471         Nest-USA      Not Used       69          1    333.80000   
1472         Nest-USA      Not Used      223          1    333.80000   
1473          Apparel       Clicked       82        102   7356.85141   

      Cluster_no Segment_Label            Itemset  
0              0        Silver  Office, Lifestyle  
1              2          Gold  Office, Lifestyle  
2              0        Silver  Office, Lifestyle  
4              0        Silver  Office, Lifestyle  
5              0        Silver  Office, Lifestyle  
...          ...           ...                ...  
1469           0        Silver  Office, Lifestyle  
1470           0        Silver  Office, Lifestyle  
1471           0        Silver  Office, Lifestyle  
1472           0        Silver  Office, Lifestyle  
1473           2          Gold  Office, Lifestyle  

[1203 rows x 15 columns]
In [98]:
from datetime import timedelta

# Function to find transactions for complex itemsets within the same day for the same customer
def find_transactions_for_itemset(itemset):
    itemset = list(itemset)  # Ensure itemset is a list
    if not itemset:
        return []
    possible_transactions = set(product_transaction_mapping[itemset[0]])
    for item in itemset[1:]:
        if item in product_transaction_mapping:
            possible_transactions.intersection_update(product_transaction_mapping[item])
        else:
            return []  # No transactions if any item is missing
    
    # Check transactions within the same day for the same customer
    same_day_transactions = []
    for transaction in possible_transactions:
        transaction_date = df.loc[df['Transaction_ID'] == transaction, 'Transaction_Date'].iloc[0]
        customer_id = df.loc[df['Transaction_ID'] == transaction, 'CustomerID'].iloc[0]
        same_day_transactions.extend(df[(df['CustomerID'] == customer_id) & 
                                         (df['Transaction_Date'] == transaction_date)]['Transaction_ID'].tolist())
    
    return list(set(same_day_transactions))  # Remove duplicates and return list

# Calculate the transactions associated with each antecedent itemset in the DataFrame and stores this information in a new column called 'Transactions'
association_rules_df['Transactions'] = association_rules_df['antecedents'].apply(find_transactions_for_itemset)

# Find CustomerIDs for these transactions
def find_customers_for_transactions(transactions):
    return df[df['Transaction_ID'].isin(transactions)]['CustomerID'].unique()

association_rules_df['CustomerIDs'] = association_rules_df['Transactions'].apply(find_customers_for_transactions)

# Include full itemset details when merging with customer profiles
def merge_with_profile(customer_ids, itemset):
    profiles = merged_profile[merged_profile['CustomerID'].isin(customer_ids)]
    profiles['Itemset'] = [', '.join(itemset)] * len(profiles)  # Convert itemset to a comma-separated string
    return profiles

def display_segment_itemset_associations():
    results = pd.DataFrame()  # Initialize an empty DataFrame to store results
    for index, row in association_rules_df.iterrows():
        itemset_example = list(row['antecedents']) + list(row['consequents'])  # Combine both antecedents and consequents
        profiles = merge_with_profile(row['CustomerIDs'], itemset_example)
        segment_itemset_count = profiles.groupby(['Segment_Label', 'Itemset']).size().reset_index(name='Count')
        results = pd.concat([results, segment_itemset_count], ignore_index=True)
    
    # Sorting and displaying the results
    results = results.sort_values(by=['Segment_Label', 'Count'], ascending=[True, False])
    print(results)

# Example usage
display_segment_itemset_associations()
   Segment_Label             Itemset  Count
3           Gold  Apparel, Drinkware    234
9           Gold  Apparel, Lifestyle    234
12          Gold     Apparel, Office    234
0           Gold  Drinkware, Apparel    233
18          Gold   Drinkware, Office    233
15          Gold     Office, Apparel    230
21          Gold   Office, Drinkware    230
27          Gold   Office, Lifestyle    230
6           Gold  Lifestyle, Apparel    226
24          Gold   Lifestyle, Office    226
1        Premium  Drinkware, Apparel      7
4        Premium  Apparel, Drinkware      7
7        Premium  Lifestyle, Apparel      7
10       Premium  Apparel, Lifestyle      7
13       Premium     Apparel, Office      7
16       Premium     Office, Apparel      7
19       Premium   Drinkware, Office      7
22       Premium   Office, Drinkware      7
25       Premium   Lifestyle, Office      7
28       Premium   Office, Lifestyle      7
5         Silver  Apparel, Drinkware   1162
11        Silver  Apparel, Lifestyle   1162
14        Silver     Apparel, Office   1162
17        Silver     Office, Apparel   1066
23        Silver   Office, Drinkware   1066
29        Silver   Office, Lifestyle   1066
2         Silver  Drinkware, Apparel    964
20        Silver   Drinkware, Office    964
8         Silver  Lifestyle, Apparel    866
26        Silver   Lifestyle, Office    866

Predict Next Purchase Date¶

In [99]:
from datetime import date 
df['Transaction_Date'] = pd.to_datetime(df['Transaction_Date']).dt.date

df_first9m = df[(df.Transaction_Date >= date(2019,1,1)) & (df.Transaction_Date < date(2019,9,1))].reset_index(drop=True)
df_last3m = df[(df.Transaction_Date >= date(2019,9,1)) & (df.Transaction_Date < date(2019,12,31))].reset_index(drop=True)
# Extract unique customer ID 
df_first9m_unique = pd.DataFrame(df_first9m['CustomerID'].unique())
df_first9m_unique.columns = ['CustomerID']
In [100]:
# Last purchase in the first 9 months
df_first9m_last_purchase = df_first9m.groupby('CustomerID').Transaction_Date.max().reset_index()
df_first9m_last_purchase.columns = ['CustomerID', 'LastPurchaseDate']

# First purchase from september
df_last3m_first_purchase = df_last3m.groupby('CustomerID').Transaction_Date.min().reset_index()
df_last3m_first_purchase.columns = ['CustomerID', 'FirstPurchaseDate']

# Merge
df_purchase_dates = pd.merge(df_first9m_last_purchase, df_last3m_first_purchase, on = 'CustomerID', how = 'left')

# Calculate next purchase date
df_purchase_dates['NextPurchaseDate'] = (df_purchase_dates['FirstPurchaseDate'] - df_purchase_dates['LastPurchaseDate']).dt.days
In [101]:
df_purchase_dates
Out[101]:
CustomerID LastPurchaseDate FirstPurchaseDate NextPurchaseDate
0 12347 2019-03-24 2019-11-01 222.0
1 12348 2019-06-22 2019-10-19 119.0
2 12359 2019-08-23 NaN NaN
3 12370 2019-06-25 NaN NaN
4 12377 2019-07-05 2019-11-21 139.0
... ... ... ... ...
1124 18256 2019-07-05 NaN NaN
1125 18259 2019-04-05 NaN NaN
1126 18260 2019-08-11 2019-10-05 55.0
1127 18269 2019-06-20 NaN NaN
1128 18283 2019-07-29 2019-10-10 73.0

1129 rows × 4 columns

In [102]:
# Extract customer ID from data frame of the first 9 months
df_user = pd.DataFrame(df_first9m['CustomerID'].unique())
df_user.columns = ['CustomerID']

# Add in column NextPurchaseDate from df_purchase_dates dataframe
df_user = pd.merge(df_user, df_purchase_dates[['CustomerID', 'NextPurchaseDate']], on = 'CustomerID', how = 'left')
df_user = df_user.fillna(999)
In [103]:
df_user
Out[103]:
CustomerID NextPurchaseDate
0 17850 999.0
1 13047 139.0
2 12583 999.0
3 15100 999.0
4 14688 255.0
... ... ...
1124 12928 999.0
1125 15749 999.0
1126 14209 999.0
1127 12728 999.0
1128 15569 999.0

1129 rows × 2 columns

RFM Scores¶

In [104]:
# Recency
df_max_purchase = df_first9m.groupby('CustomerID').Transaction_Date.max().reset_index()
df_max_purchase.columns = ['CustomerID', 'MaxPurchaseDate']
df_max_purchase['Recency'] = (df_max_purchase['MaxPurchaseDate'].max() - df_max_purchase['MaxPurchaseDate']).dt.days
In [105]:
df_user = pd.merge(df_user, df_max_purchase[['CustomerID', 'Recency']], on = 'CustomerID')
In [106]:
df_user.head()
Out[106]:
CustomerID NextPurchaseDate Recency
0 17850 999.0 217
1 13047 139.0 30
2 12583 999.0 29
3 15100 999.0 1
4 14688 255.0 241
In [107]:
df_user.Recency.describe()
Out[107]:
count    1129.000000
mean       91.518158
std        70.067431
min         0.000000
25%        30.000000
50%        78.000000
75%       143.000000
max       242.000000
Name: Recency, dtype: float64
In [108]:
# elbow curve
sse = {}
df_recency = df_user[['Recency']].copy()  # Create a copy of the DataFrame

for k in range(1, 10):
    kmeans = KMeans(n_clusters=k, max_iter=1000).fit(df_recency)
    df_recency["clusters"] = kmeans.labels_
    sse[k] = kmeans.inertia_

plt.figure()
plt.plot(list(sse.keys()), list(sse.values()))
plt.xlabel("Number of Clusters")
plt.ylabel("SSE")
plt.show()
In [109]:
kmeans = KMeans(n_clusters=4)
kmeans.fit(df_user[['Recency']])
df_user['RecencyScore'] = kmeans.predict(df_user[['Recency']])
In [110]:
def order_cluster(cluster_field_name, target_field_name, df, ascending):
    new_cluster_field_name = "new_" + cluster_field_name
    df_new = df.groupby(cluster_field_name)[target_field_name].mean().reset_index()
    df_new = df_new.sort_values(by=target_field_name, ascending = ascending).reset_index(drop=True)
    df_new['index'] = df_new.index
    df_final = pd.merge(df, df_new[[cluster_field_name, 'index']], on = cluster_field_name)
    df_final = df_final.drop([cluster_field_name], axis = 1) 
    df_final = df_final.rename(columns=({"index":cluster_field_name}))
    return df_final

df_user = order_cluster('RecencyScore','Recency', df_user, False)
df_user
Out[110]:
CustomerID NextPurchaseDate Recency RecencyScore
0 17850 999.0 217 0
1 14688 255.0 241 0
2 13748 999.0 242 0
3 17548 999.0 240 0
4 13705 999.0 240 0
... ... ... ... ...
1124 14082 999.0 128 1
1125 16367 999.0 117 1
1126 17917 999.0 116 1
1127 12875 999.0 117 1
1128 16010 999.0 116 1

1129 rows × 4 columns

In [111]:
df_user.groupby('RecencyScore')['Recency'].describe()
Out[111]:
count mean std min 25% 50% 75% max
RecencyScore
0 152.0 217.184211 15.484845 184.0 210.0 218.0 230.0 242.0
1 238.0 148.096639 17.908202 116.0 134.0 146.0 163.0 182.0
2 297.0 83.047138 17.070139 54.0 70.0 81.0 99.0 115.0
3 442.0 23.529412 15.986113 0.0 9.0 22.0 37.0 53.0
In [112]:
# Frequency
df_frequency = df_first9m.groupby('CustomerID').Transaction_Date.count().reset_index()
df_frequency.columns = ['CustomerID', 'Frequency']
df_frequency.head()
Out[112]:
CustomerID Frequency
0 12347 31
1 12348 17
2 12359 16
3 12370 91
4 12377 43
In [113]:
df_user = pd.merge(df_user, df_frequency, on = 'CustomerID')
df_user.head()
Out[113]:
CustomerID NextPurchaseDate Recency RecencyScore Frequency
0 17850 999.0 217 0 297
1 14688 255.0 241 0 19
2 13748 999.0 242 0 1
3 17548 999.0 240 0 7
4 13705 999.0 240 0 10
In [114]:
sse = {}
df_frequency = df_user[['Frequency']].copy()  # Create a copy of the DataFrame

for k in range(1, 10):
    kmeans = KMeans(n_clusters=k, max_iter=1000).fit(df_frequency)
    df_recency["clusters"] = kmeans.labels_
    sse[k] = kmeans.inertia_

plt.figure()
plt.plot(list(sse.keys()), list(sse.values()))
plt.xlabel("Number of Clusters")
plt.ylabel("SSE")
plt.show()
In [115]:
kmeans = KMeans(n_clusters=4)
kmeans.fit(df_user[['Frequency']])
df_user['FrequencyScore'] = kmeans.predict(df_user[['Frequency']])
In [116]:
df_user.groupby('FrequencyScore')['Frequency'].describe()
Out[116]:
count mean std min 25% 50% 75% max
FrequencyScore
0 271.0 50.287823 13.859019 32.0 38.00 48.0 60.00 83.0
1 82.0 117.329268 30.096575 84.0 92.25 112.0 136.25 210.0
2 769.0 13.085826 8.599626 1.0 5.00 13.0 19.00 31.0
3 7.0 385.285714 141.656526 261.0 309.50 346.0 393.50 684.0
In [117]:
df_user = order_cluster('FrequencyScore','Frequency', df_user, True)
df_user
Out[117]:
CustomerID NextPurchaseDate Recency RecencyScore Frequency FrequencyScore
0 17850 999.0 217 0 297 3
1 14606 18.0 7 3 360 3
2 15311 9.0 7 3 322 3
3 12748 41.0 36 3 684 3
4 14911 34.0 19 3 346 3
... ... ... ... ... ... ...
1124 13174 254.0 180 1 169 2
1125 12433 999.0 143 1 147 2
1126 17827 999.0 127 1 112 2
1127 17259 999.0 173 1 87 2
1128 17341 999.0 150 1 161 2

1129 rows × 6 columns

In [118]:
# Monetary Value
df_monetary = df_first9m.groupby('CustomerID').Invoice.sum().reset_index()
df_monetary.columns = ['CustomerID', 'Monetary']
df_monetary.head()
Out[118]:
CustomerID Monetary
0 12347 12901.28216
1 12348 878.21724
2 12359 784.03320
3 12370 8054.88702
4 12377 4219.86498
In [119]:
df_user = pd.merge(df_user, df_monetary, on = 'CustomerID')
df_user.head()
Out[119]:
CustomerID NextPurchaseDate Recency RecencyScore Frequency FrequencyScore Monetary
0 17850 999.0 217 0 297 3 40531.38476
1 14606 18.0 7 3 360 3 34610.78703
2 15311 9.0 7 3 322 3 62975.87976
3 12748 41.0 36 3 684 3 86143.00498
4 14911 34.0 19 3 346 3 38682.34453
In [120]:
sse = {}
df_monetary = df_user[['Monetary']].copy()  # Create a copy of the DataFrame

for k in range(1, 10):
    kmeans = KMeans(n_clusters=k, max_iter=1000).fit(df_monetary)
    df_recency["clusters"] = kmeans.labels_
    sse[k] = kmeans.inertia_

plt.figure()
plt.plot(list(sse.keys()), list(sse.values()))
plt.xlabel("Number of Clusters")
plt.ylabel("SSE")
plt.show()
In [121]:
kmeans = KMeans(n_clusters=4)
kmeans.fit(df_user[['Monetary']])
df_user['MonetaryScore'] = kmeans.predict(df_user[['Monetary']])
In [122]:
df_user = order_cluster('MonetaryScore','Monetary', df_user, True)
df_user
Out[122]:
CustomerID NextPurchaseDate Recency RecencyScore Frequency FrequencyScore Monetary MonetaryScore
0 17850 999.0 217 0 297 3 40531.38476 2
1 14606 18.0 7 3 360 3 34610.78703 2
2 14911 34.0 19 3 346 3 38682.34453 2
3 17841 14.0 8 3 427 3 40768.85344 2
4 13081 999.0 0 3 261 3 23607.04965 2
... ... ... ... ... ... ... ... ...
1124 15426 999.0 169 1 87 2 7671.12216 1
1125 13174 254.0 180 1 169 2 16049.48846 1
1126 17827 999.0 127 1 112 2 8747.50404 1
1127 17259 999.0 173 1 87 2 6587.70300 1
1128 17341 999.0 150 1 161 2 13437.68995 1

1129 rows × 8 columns

In [123]:
df_user.groupby('MonetaryScore')['Monetary'].describe()
Out[123]:
count mean std min 25% 50% 75% max
MonetaryScore
0 917.0 1497.313441 1205.735670 7.32000 477.240000 1167.992100 2331.138900 4571.40105
1 196.0 7786.883740 2647.215751 4669.39420 5663.547540 7200.262005 8847.932535 16049.48846
2 14.0 26142.091861 8830.008688 17733.48544 19286.055310 22513.490325 33251.835132 40768.85344
3 2.0 74559.442370 16381.631344 62975.87976 68767.661065 74559.442370 80351.223675 86143.00498
In [124]:
# Overall segmentation
df_user['OverallScore']= df_user['RecencyScore'] + df_user['FrequencyScore'] + df_user['MonetaryScore']
df_user.groupby('OverallScore')[['Recency', 'Frequency', 'Monetary']].mean()
Out[124]:
Recency Frequency Monetary
OverallScore
0 219.421053 11.149123 1310.507012
1 150.454054 13.324324 1202.965619
2 101.170370 20.200000 2003.159898
3 44.126543 22.555556 1988.157034
4 40.902655 50.584071 4177.942595
5 47.283333 76.733333 8013.653154
6 23.222222 117.370370 10330.967894
7 24.333333 151.000000 19941.476807
8 8.500000 348.500000 34417.258662
9 21.500000 503.000000 74559.442370
In [125]:
df_user['Segment'] = 'Low-Value'
df_user.loc[df_user['OverallScore']>2, 'Segment']='Mid-Value'
df_user.loc[df_user['OverallScore']>5, 'Segment']='High-Value'
In [126]:
df_user
Out[126]:
CustomerID NextPurchaseDate Recency RecencyScore Frequency FrequencyScore Monetary MonetaryScore OverallScore Segment
0 17850 999.0 217 0 297 3 40531.38476 2 5 Mid-Value
1 14606 18.0 7 3 360 3 34610.78703 2 8 High-Value
2 14911 34.0 19 3 346 3 38682.34453 2 8 High-Value
3 17841 14.0 8 3 427 3 40768.85344 2 8 High-Value
4 13081 999.0 0 3 261 3 23607.04965 2 8 High-Value
... ... ... ... ... ... ... ... ... ... ...
1124 15426 999.0 169 1 87 2 7671.12216 1 4 Mid-Value
1125 13174 254.0 180 1 169 2 16049.48846 1 4 Mid-Value
1126 17827 999.0 127 1 112 2 8747.50404 1 4 Mid-Value
1127 17259 999.0 173 1 87 2 6587.70300 1 4 Mid-Value
1128 17341 999.0 150 1 161 2 13437.68995 1 4 Mid-Value

1129 rows × 10 columns

In [127]:
# Create a dataframe with CustomerID and Transaction_Date
df_day_order = df_first9m[['CustomerID', 'Transaction_Date']].copy()

# Convert Transaction_Date to day
df_day_order['TransactionDay'] = pd.to_datetime(df_first9m['Transaction_Date']).dt.date
df_day_order = df_day_order.sort_values(['CustomerID', 'Transaction_Date'])

# Drop duplicates
df_day_order = df_day_order.drop_duplicates(subset=['CustomerID', 'TransactionDay'], keep='first')
In [128]:
# shift last 3 purchase day
df_day_order['PrevPurchaseDate'] = df_day_order.groupby('CustomerID')['TransactionDay'].shift(1)
df_day_order['T2PurchaseDate'] = df_day_order.groupby('CustomerID')['TransactionDay'].shift(2)
df_day_order['T3PurchaseDate'] = df_day_order.groupby('CustomerID')['TransactionDay'].shift(3)
In [129]:
df_day_order
Out[129]:
CustomerID Transaction_Date TransactionDay PrevPurchaseDate T2PurchaseDate T3PurchaseDate
22739 12347 2019-03-24 2019-03-24 NaN NaN NaN
12512 12348 2019-06-22 2019-06-22 NaN NaN NaN
26948 12359 2019-08-23 2019-08-23 NaN NaN NaN
20783 12370 2019-05-26 2019-05-26 NaN NaN NaN
20866 12370 2019-06-25 2019-06-25 2019-05-26 NaN NaN
... ... ... ... ... ... ...
20061 18260 2019-06-22 2019-06-22 NaN NaN NaN
20072 18260 2019-08-11 2019-08-11 2019-06-22 NaN NaN
8295 18269 2019-04-05 2019-04-05 NaN NaN NaN
8302 18269 2019-06-20 2019-06-20 2019-04-05 NaN NaN
27039 18283 2019-07-29 2019-07-29 NaN NaN NaN

2197 rows × 6 columns

In [130]:
# Number of days difference between last 3 purchase
df_day_order['DayDiff1'] = (df_day_order['TransactionDay'] - df_day_order['PrevPurchaseDate']).dt.days
df_day_order['DayDiff2'] = (df_day_order['TransactionDay'] - df_day_order['T2PurchaseDate']).dt.days
df_day_order['DayDiff3'] = (df_day_order['TransactionDay'] - df_day_order['T3PurchaseDate']).dt.days
df_day_order.head()
Out[130]:
CustomerID Transaction_Date TransactionDay PrevPurchaseDate T2PurchaseDate T3PurchaseDate DayDiff1 DayDiff2 DayDiff3
22739 12347 2019-03-24 2019-03-24 NaN NaN NaN NaN NaN NaN
12512 12348 2019-06-22 2019-06-22 NaN NaN NaN NaN NaN NaN
26948 12359 2019-08-23 2019-08-23 NaN NaN NaN NaN NaN NaN
20783 12370 2019-05-26 2019-05-26 NaN NaN NaN NaN NaN NaN
20866 12370 2019-06-25 2019-06-25 2019-05-26 NaN NaN 30.0 NaN NaN
In [131]:
df_day_diff = df_day_order.groupby('CustomerID').agg({'DayDiff1':['mean', 'std']}).reset_index()
df_day_diff.columns = ['CustomerID', 'DayDiffMean', 'DayDiffStd']
df_day_diff.head()
Out[131]:
CustomerID DayDiffMean DayDiffStd
0 12347 NaN NaN
1 12348 NaN NaN
2 12359 NaN NaN
3 12370 30.0 NaN
4 12377 NaN NaN
In [132]:
df_day_order_last = df_day_order.drop_duplicates(subset=['CustomerID'], keep = 'last')
df_day_order_last = df_day_order_last.dropna()
df_day_order_last
Out[132]:
CustomerID Transaction_Date TransactionDay PrevPurchaseDate T2PurchaseDate T3PurchaseDate DayDiff1 DayDiff2 DayDiff3
2279 12471 2019-08-28 2019-08-28 2019-08-15 2019-07-28 2019-07-20 13.0 31.0 39.0
3446 12481 2019-08-02 2019-08-02 2019-07-10 2019-04-12 2019-04-11 23.0 112.0 113.0
10779 12494 2019-07-20 2019-07-20 2019-06-14 2019-06-13 2019-04-12 36.0 37.0 99.0
30443 12647 2019-06-26 2019-06-26 2019-06-09 2019-03-21 2019-02-25 17.0 97.0 121.0
12078 12681 2019-08-15 2019-08-15 2019-07-22 2019-07-09 2019-03-20 24.0 37.0 148.0
... ... ... ... ... ... ... ... ... ...
17201 18041 2019-07-31 2019-07-31 2019-07-30 2019-04-16 2019-01-19 1.0 106.0 193.0
15416 18055 2019-06-15 2019-06-15 2019-06-14 2019-03-19 2019-02-11 1.0 88.0 124.0
18749 18092 2019-08-29 2019-08-29 2019-07-14 2019-07-06 2019-04-11 46.0 54.0 140.0
14042 18109 2019-08-23 2019-08-23 2019-07-21 2019-05-02 2019-02-12 33.0 113.0 192.0
10196 18116 2019-07-16 2019-07-16 2019-07-15 2019-06-18 2019-06-17 1.0 28.0 29.0

118 rows × 9 columns

In [133]:
df_day_order_last = pd.merge(df_day_order_last, df_day_diff, on = 'CustomerID')
In [134]:
df_user = pd.merge(df_user, df_day_order_last[['CustomerID', 'DayDiff1', 'DayDiff2', 'DayDiff3', 'DayDiffMean', 'DayDiffStd' ]])
df_user.describe()
Out[134]:
CustomerID NextPurchaseDate Recency RecencyScore Frequency FrequencyScore Monetary MonetaryScore OverallScore DayDiff1 DayDiff2 DayDiff3 DayDiffMean DayDiffStd
count 118.000000 118.000000 118.000000 118.000000 118.000000 118.000000 118.000000 118.000000 118.000000 118.000000 118.000000 118.000000 118.000000 118.000000
mean 15265.635593 373.423729 32.949153 2.771186 90.559322 1.254237 9159.489043 0.711864 4.737288 26.610169 60.508475 103.703390 37.572460 35.809408
std 1675.704606 434.726644 36.669661 0.529651 93.810642 0.859203 11761.162986 0.705312 1.565810 30.447667 41.136576 56.838248 16.708565 19.919037
min 12471.000000 1.000000 0.000000 0.000000 6.000000 0.000000 395.882300 0.000000 1.000000 1.000000 2.000000 3.000000 1.000000 0.000000
25% 13782.250000 56.000000 8.000000 3.000000 35.250000 1.000000 2837.539815 0.000000 3.000000 2.000000 29.250000 57.000000 26.437500 20.752457
50% 15103.500000 103.500000 18.000000 3.000000 61.500000 1.000000 6131.462435 1.000000 5.000000 16.000000 50.500000 93.500000 37.000000 33.549336
75% 16764.000000 999.000000 44.750000 3.000000 115.000000 2.000000 10861.316430 1.000000 6.000000 38.750000 88.750000 148.750000 49.000000 45.539934
max 18116.000000 999.000000 217.000000 3.000000 684.000000 3.000000 86143.004980 3.000000 9.000000 146.000000 198.000000 227.000000 75.666667 92.376043

Create a copy dataframe & run models¶

In [135]:
df_npd = df_user.copy()
df_npd = pd.get_dummies(df_npd)
In [136]:
# Define categories for dependent variable
df_npd['NextPurchaseDayRange'] = '0-30 days'
df_npd.loc[(df_npd['NextPurchaseDate'] > 30) & (df_npd['NextPurchaseDate'] <= 60), 'NextPurchaseDayRange'] = '30-60 days'
df_npd.loc[(df_npd['NextPurchaseDate'] > 60) & (df_npd['NextPurchaseDate'] <= 90), 'NextPurchaseDayRange'] = '60-90 days'
df_npd.loc[df_npd['NextPurchaseDate'] > 90, 'NextPurchaseDayRange'] = '90+ days'
In [137]:
df_npd.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 118 entries, 0 to 117
Data columns (total 18 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   CustomerID            118 non-null    int64  
 1   NextPurchaseDate      118 non-null    float64
 2   Recency               118 non-null    int64  
 3   RecencyScore          118 non-null    int64  
 4   Frequency             118 non-null    int64  
 5   FrequencyScore        118 non-null    int64  
 6   Monetary              118 non-null    float64
 7   MonetaryScore         118 non-null    int64  
 8   OverallScore          118 non-null    int64  
 9   DayDiff1              118 non-null    float64
 10  DayDiff2              118 non-null    float64
 11  DayDiff3              118 non-null    float64
 12  DayDiffMean           118 non-null    float64
 13  DayDiffStd            118 non-null    float64
 14  Segment_High-Value    118 non-null    uint8  
 15  Segment_Low-Value     118 non-null    uint8  
 16  Segment_Mid-Value     118 non-null    uint8  
 17  NextPurchaseDayRange  118 non-null    object 
dtypes: float64(7), int64(7), object(1), uint8(3)
memory usage: 15.1+ KB
In [138]:
# Drop unnecessary variables to avoid multicollinearity
columns_to_keep = ['CustomerID', 'NextPurchaseDayRange','Segment_High-Value', 'Segment_Low-Value', 'Segment_Mid-Value', 
                   'DayDiffMean', 'DayDiffStd', 'OverallScore']
df_npd = df_npd[columns_to_keep]
In [139]:
df_npd
Out[139]:
CustomerID NextPurchaseDayRange Segment_High-Value Segment_Low-Value Segment_Mid-Value DayDiffMean DayDiffStd OverallScore
0 17850 90+ days 0 0 1 2.777778 3.527668 5
1 14606 0-30 days 1 0 0 14.666667 17.128367 8
2 14911 30-60 days 1 0 0 12.529412 13.505173 8
3 17841 0-30 days 1 0 0 18.666667 15.310919 8
4 13081 90+ days 1 0 0 40.600000 70.009285 8
... ... ... ... ... ... ... ... ...
113 16782 90+ days 0 0 1 26.333333 15.044379 5
114 14796 90+ days 0 0 1 15.666667 25.403412 5
115 15570 60-90 days 0 0 1 25.800000 34.426734 5
116 15356 90+ days 0 0 1 23.666667 39.259818 5
117 17827 90+ days 0 0 1 1.000000 0.000000 4

118 rows × 8 columns

In [140]:
# Split the data into features and target
feature_cols = ['Segment_High-Value', 'Segment_Low-Value', 'Segment_Mid-Value', 
                   'DayDiffMean', 'DayDiffStd', 'OverallScore']
X = df_npd[feature_cols]
y = df_npd['NextPurchaseDayRange']  
In [141]:
# view the percantage of each class
class_counts = df_npd.NextPurchaseDayRange.value_counts()/len(df_npd)*100

# Visualize
# Set style
sb.set_style("whitegrid")

# Set font
plt.rcParams['font.family'] = 'serif'

# Plot the bar plot
plt.figure(figsize=(10, 6))
sb.barplot(x=class_counts.index, y=class_counts.values, color = "#0059b3")

# Add labels and title
plt.title('Proportion of Each Class in Next Purchase Day Range', fontsize=16)
plt.xlabel('Next Purchase Day Range', fontsize=14)
plt.ylabel('Percentage', fontsize=14)

# Add percentages to the bars
for i, v in enumerate(class_counts.values):
    plt.text(i, v + 0.5, f'{v:.2f}%', ha='center', va='bottom', fontsize=12, color='black')

# Rotate x-axis labels for better readability
plt.xticks(rotation=45)

# Show the plot
plt.tight_layout()
plt.show()
In [142]:
# Apply SMOTE
smote = SMOTE()
X_resampled, y_resampled = smote.fit_resample(X, y)

# Convert back to DataFrame
df_resampled = pd.DataFrame(X_resampled, columns=X.columns)

Logistic¶

In [144]:
# Splitting the dataset into training (50%) and testing (50%)
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.5, random_state=42)

# Create the model, using the 'lbfgs' solver for multinomial logistic regression
model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)

# Fit the model to the training data
y_score = model.fit(X_train, y_train).predict_proba(X_test)

# Predict the labels for the test set
y_pred = model.predict(X_test)

# Calculate and print the accuracy score
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')
Accuracy: 0.37

Random Forest¶

In [145]:
# Initialize the Random Forest classifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Fit the classifier to the training data
rf.fit(X_train, y_train)

# Predict on the test data
y_pred = rf.predict(X_test)

# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classif_report = classification_report(y_test, y_pred)

# Print the results
print(f"Random Forest Model Accuracy: {accuracy:.2f}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(classif_report)
Random Forest Model Accuracy: 0.58
Confusion Matrix:
[[29  1  2  3]
 [ 2 21  6  3]
 [ 8  5 17  2]
 [10 10  4 11]]
Classification Report:
              precision    recall  f1-score   support

   0-30 days       0.59      0.83      0.69        35
  30-60 days       0.57      0.66      0.61        32
  60-90 days       0.59      0.53      0.56        32
    90+ days       0.58      0.31      0.41        35

    accuracy                           0.58       134
   macro avg       0.58      0.58      0.57       134
weighted avg       0.58      0.58      0.57       134

XGBoost¶

In [146]:
# Encode string class labels to integers
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Fit the XGBoost model using the encoded labels
xgb_classifier = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
xgb_classifier.fit(X_train, y_train_encoded)

# Predict on the test data
y_pred_encoded = xgb_classifier.predict(X_test)
y_pred = label_encoder.inverse_transform(y_pred_encoded) # Convert predictions back to original labels

# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classif_report = classification_report(y_test, y_pred)

# Print the results
print(f"XGBoost Model Accuracy: {accuracy:.2f}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(classif_report)
XGBoost Model Accuracy: 0.57
Confusion Matrix:
[[31  1  1  2]
 [ 5 18  5  4]
 [ 9  3 18  2]
 [ 8 12  5 10]]
Classification Report:
              precision    recall  f1-score   support

   0-30 days       0.58      0.89      0.70        35
  30-60 days       0.53      0.56      0.55        32
  60-90 days       0.62      0.56      0.59        32
    90+ days       0.56      0.29      0.38        35

    accuracy                           0.57       134
   macro avg       0.57      0.57      0.55       134
weighted avg       0.57      0.57      0.55       134